library(caret)
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
## Loading required package: ggplot2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages -------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.1 v purrr 0.3.4
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'tidyr' was built under R version 3.6.3
## Warning: package 'readr' was built under R version 3.6.2
## Warning: package 'purrr' was built under R version 3.6.3
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts ----------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.6.3
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:dplyr':
##
## intersect, setdiff, union
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(tidyr)
#library(rapportools)
Loading the train data
# Load train data - Replace blank cells with NA
df_x_main <- read.csv('airbnb_train_x.csv', na.strings = c("", "NA"))
df_y <- read.csv('airbnb_train_y.csv', na.strings = c("", "NA"))
df_x <- df_x_main
Loading the test data
tdf_x_main <- read.csv('airbnb_test_x.csv', na.strings = c("", "NA"))
tdf <- tdf_x_main
Check for missingness
# Check for missingness
colSums(is.na(df_x))
## X access
## 0 34436
## accommodates amenities
## 0 0
## availability_30 availability_365
## 0 0
## availability_60 availability_90
## 0 0
## bathrooms bed_type
## 246 1
## bedrooms beds
## 93 83
## cancellation_policy city
## 0 49
## city_name cleaning_fee
## 1 18324
## country country_code
## 1 7
## description experiences_offered
## 27 3
## extra_people first_review
## 0 8
## guests_included host_about
## 0 30812
## host_acceptance_rate host_has_profile_pic
## 91758 142
## host_identity_verified host_is_superhost
## 148 142
## host_listings_count host_location
## 145 411
## host_name host_neighbourhood
## 147 17233
## host_response_rate host_response_time
## 15794 15794
## host_since host_total_listings_count
## 142 142
## host_verifications house_rules
## 10 30393
## instant_bookable interaction
## 12 35880
## is_business_travel_ready is_location_exact
## 44551 19
## jurisdiction_names latitude
## 52690 10
## license longitude
## 85721 11
## market maximum_nights
## 433 10
## minimum_nights monthly_price
## 10 79436
## name neighborhood_overview
## 33 30743
## neighbourhood notes
## 13553 48533
## price property_type
## 19 20
## require_guest_phone_verification require_guest_profile_picture
## 19 19
## requires_license room_type
## 19 19
## security_deposit smart_location
## 40648 20
## space square_feet
## 21045 98419
## state street
## 22 19
## summary transit
## 2964 29167
## weekly_price zipcode
## 75895 1049
There are a lot of columns with missing data but it’s easier to understand using proportions
missing.values <- df_x_main %>%
gather(key = "key", value = "val") %>%
mutate(isna = is.na(val)) %>%
group_by(key) %>%
mutate(total = n()) %>%
group_by(key, total, isna) %>%
summarise(num.isna = n()) %>%
mutate(pct = num.isna / total * 100)
## Warning: attributes are not identical across measure variables;
## they will be dropped
levels <-
(missing.values %>% filter(isna == T) %>% arrange(desc(pct)))$key
percentage.plot <- missing.values %>%
ggplot() +
geom_bar(aes(x = reorder(key, desc(pct)),
y = pct, fill=isna),
stat = 'identity', alpha=0.8) +
scale_x_discrete(limits = levels) +
scale_fill_manual(name = "",
values = c('steelblue', 'tomato3'), labels = c("Present", "Missing")) +
coord_flip() +
labs(title = "Percentage of missing values", x =
'Variable', y = "% of missing values")
percentage.plot
## Warning: Removed 10 rows containing missing values (position_stack).
We can also see how the rows are -
row.plot <- df_x %>%
mutate(id = row_number()) %>%
gather(-id, key = "key", value = "val") %>%
mutate(isna = is.na(val)) %>%
ggplot(aes(key, id, fill = isna)) +
geom_raster(alpha=0.8) +
scale_fill_manual(name = "",
values = c('steelblue', 'tomato3'),
labels = c("Present", "Missing")) +
scale_x_discrete(limits = levels) +
labs(x = "Variable",
y = "Row Number", title = "Missing values in rows") +
coord_flip()
## Warning: attributes are not identical across measure variables;
## they will be dropped
row.plot
## Warning: Removed 1000000 rows containing missing values (geom_raster).
In dealing with the missingness, let’s first create missingness indicators for all the variables. For this, we can define a function. We shall use this once we are sure we’ve made a fair amount of cleaning to most of the columns.
appendNAs <- function(dataset, cols) {
append_these = data.frame( is.na(dataset[, cols] ))
names(append_these) = paste(names(append_these), "NA", sep = "_")
dataset = cbind(dataset, append_these)
dataset[is.na(dataset)] = -1
return(dataset)
}
A glimpse at the structure of the data
glimpse(df_x)
## Rows: 100,000
## Columns: 70
## $ X <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,...
## $ access <fct> "I like for our guests to feel rig...
## $ accommodates <fct> 12, 1, 10, 4, 4, 3, 6, 1, 2, 2, 1,...
## $ amenities <fct> "{TV,\"Cable TV\",Internet,\"Wirel...
## $ availability_30 <fct> 16, 30, 1, 13, 14, 3, 4, 29, 2, 0,...
## $ availability_365 <int> 305, 365, 330, 150, 39, 3, 131, 36...
## $ availability_60 <fct> 33.0, 60.0, 25.0, 32.0, 27.0, 3.0,...
## $ availability_90 <fct> 57, 90, 55, 62, 39, 3, 44, 87, 62,...
## $ bathrooms <fct> 2.5, 1.0, 2.0, 1.0, 1.0, 1.0, 2.0,...
## $ bed_type <fct> Real Bed, Real Bed, Real Bed, Real...
## $ bedrooms <fct> 4.0, 1.0, 4.0, 1.0, 2.0, 1.0, 2.0,...
## $ beds <fct> 6.0, 1.0, 6.0, 2.0, 2.0, 2.0, 3.0,...
## $ cancellation_policy <fct> strict, flexible, strict, moderate...
## $ city <fct> Nashville, Los Angeles, San Diego,...
## $ city_name <fct> Nashville, Los Angeles, San Diego,...
## $ cleaning_fee <fct> $250.00, NA, $50.00, $45.00, $75.0...
## $ country <fct> United States, United States, Unit...
## $ country_code <fct> US, US, US, US, US, US, US, US, US...
## $ description <fct> "Vacation in this Nashville home f...
## $ experiences_offered <fct> none, none, none, none, none, none...
## $ extra_people <fct> $0.00, $0.00, $0.00, $20.00, $50.0...
## $ first_review <fct> 2016-12-18, 2015-07-17, 2016-05-08...
## $ guests_included <dbl> 8, 1, 1, 2, 4, 2, 5, 1, 1, 1, 1, 1...
## $ host_about <fct> NA, NA, "I am from Virginia, USA a...
## $ host_acceptance_rate <fct> NA, NA, 100%, NA, NA, NA, NA, NA, ...
## $ host_has_profile_pic <fct> t, t, t, t, t, t, t, t, t, t, t, t...
## $ host_identity_verified <fct> f, t, t, t, t, t, t, t, t, t, f, t...
## $ host_is_superhost <fct> t, f, f, t, t, t, t, f, f, f, f, t...
## $ host_listings_count <fct> 1.0, 1.0, 14.0, 1.0, 1.0, 5.0, 7.0...
## $ host_location <fct> "Nashville, Tennessee, United Stat...
## $ host_name <fct> May & Eric, Jonathon, Jeremy, Gene...
## $ host_neighbourhood <fct> NA, Silver Lake, East Village, Nea...
## $ host_response_rate <fct> 100%, NA, 100%, 100%, 100%, 100%, ...
## $ host_response_time <fct> within an hour, NA, within an hour...
## $ host_since <fct> 2015-09-03, 2015-07-07, 2013-07-20...
## $ host_total_listings_count <fct> 1.0, 1.0, 14.0, 1.0, 1.0, 5.0, 7.0...
## $ host_verifications <fct> "['email', 'phone', 'google', 'rev...
## $ house_rules <fct> NA, NA, "No pets, parties, or indo...
## $ instant_bookable <fct> t, f, t, t, f, f, f, f, f, f, t, f...
## $ interaction <fct> "Don't hesitate to call or text an...
## $ is_business_travel_ready <fct> t, NA, NA, NA, f, t, t, f, f, f, f...
## $ is_location_exact <fct> f, t, t, t, t, t, t, t, t, t, t, t...
## $ jurisdiction_names <fct> NA, "City of Los Angeles, CA", "SA...
## $ latitude <fct> 36.1467563677, 34.0842597547, 32.7...
## $ license <fct> NA, NA, NA, NA, 17STR-04969, 10759...
## $ longitude <fct> -86.81720471780001, -118.273509113...
## $ market <fct> Nashville, Los Angeles, San Diego,...
## $ maximum_nights <dbl> 365, 1125, 1125, 1125, 1125, 10, 1...
## $ minimum_nights <int> 2, 1, 3, 2, 2, 2, 3, 2, 2, 3, 1, 3...
## $ monthly_price <fct> NA, NA, NA, NA, "$2,250.00", NA, N...
## $ name <fct> "BRAND NEW ! NASHVILLE LOFT", "Sil...
## $ neighborhood_overview <fct> "This modern loft is locate in a n...
## $ neighbourhood <fct> NA, Silver Lake, Pacific Beach, Ne...
## $ notes <fct> NA, NA, "Its a fun beach town - th...
## $ price <fct> $375.00, $175.00, $499.00, $145.00...
## $ property_type <fct> Loft, House, House, House, Apartme...
## $ require_guest_phone_verification <fct> f, f, f, f, f, f, f, f, f, f, f, f...
## $ require_guest_profile_picture <fct> f, f, f, f, f, f, f, f, f, f, f, f...
## $ requires_license <fct> f, f, f, f, t, t, t, f, f, f, f, f...
## $ room_type <fct> Entire home/apt, Private room, Ent...
## $ security_deposit <fct> "$500.00", NA, NA, NA, "$150.00", ...
## $ smart_location <fct> "Nashville, TN", "Los Angeles, CA"...
## $ space <fct> "Beautiful brand new loft in a cen...
## $ square_feet <int> NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ state <fct> TN, CA, CA, DC, LA, CA, LA, NY, NY...
## $ street <fct> "Nashville, TN, United States", "S...
## $ summary <fct> "Vacation in this Nashville home f...
## $ transit <fct> NA, NA, "Ubers are everywhere. We ...
## $ weekly_price <fct> NA, NA, NA, NA, $700.00, NA, NA, $...
## $ zipcode <fct> 37203, 90026, 92109.0, 20002, 7011...
Inspect the columns that have numeric data -
1 accommodates Factor 2 availability_30 Factor 3 availability_365 int 4 availability_60 Factor 5 availability_90 Factor 6 bathrooms Factor 7 bedrooms Factor 8 beds Factor 9 cleaning_fee Factor - (needs to be cleaned - has $ symbol like $250.00) 10 extra_people Factor - (needs to be cleaned - has $ symbol like $250.00) 11 guests_included num 12 host_acceptance_rate Factor - (needs to be cleaned - has format like so - 100%) 13 host_listings_count Factor 14 host_response_rate Factor - (needs to be cleaned - has format like so - 100%) 15 host_total_listings_count Factor 16 latitude Factor 17 longitude Factor 18 maximum_nights num 19 minimum_nights int 20 monthly_price Factor - (needs to be cleaned - has $ symbol like $250.00) 21 price Factor - (needs to be cleaned - has $ symbol like $250.00) 22 security_deposit Factor - (needs to be cleaned - has $ symbol like $250.00) 23 square_feet int 24 weekly_price Factor - (needs to be cleaned - has $ symbol like $250.00)
First let’s explore the columns with the $ symbols and commas and percentages -
dollar_num_cols = c("cleaning_fee", "extra_people", "monthly_price", "price", "security_deposit", "weekly_price")
perc_num_cols = c("host_acceptance_rate", "host_response_rate")
# The columns which have prices
qplot(cleaning_fee, data = df_x) + coord_flip()
qplot(extra_people, data = df_x) + coord_flip()
qplot(monthly_price, data = df_x) + coord_flip()
qplot(price, data = df_x) + coord_flip()
qplot(security_deposit, data = df_x) + coord_flip()
qplot(weekly_price, data = df_x) + coord_flip()
# Observations
# cleaning_fee
# - Majority of NAs
# - Has a few f and t
# extra_people
# - has some longitude/latitude information -
# - 40.7115296729, 37.7172817854, 34.0550373047, 34.0544381605, 34.04703657, 34.0305066944, 33.9847962097, 33.7751358179, 30.3646152573
# price
# -looks fine except for a very small number of NAs
# security_deposit
# mostly NAs
# second most popular category is $0.00
# weekly_price
# - looks fine
# - mostly NAs
# monthly_price
# - NA is the most popular - there are lots
# - other values are mostly 4 digits or more with very few 3 digits
# The columns which have %
qplot(host_acceptance_rate, data = df_x) + coord_flip()
qplot(host_response_rate, data = df_x) + coord_flip()
# host_acceptance_rate
# - most popular category - NA
# - all others are within format
# host_response_rate
# - 100% is most popular
# - Next is NA
# - There are 4 out of place categories - Apartment, Loft, House, Condominium
Storing the discrepancies in a variable for later use -
extra_people_discrepancies = c(40.7115296729, 37.7172817854, 34.0550373047, 34.0544381605, 34.04703657, 34.0305066944, 33.9847962097, 33.7751358179, 30.3646152573)
hostresponserate_discrepancies = c("Apartment", "Loft", "House", "Condominium")
df_x[df_x$extra_people %in% extra_people_discrepancies,]
## X access accommodates amenities availability_30 availability_365
## 16246 16246 <NA> t t f 1
## 30584 30584 <NA> t t t 2
## 47615 47615 <NA> t t f 1
## 56281 56281 <NA> t t t 2
## 65792 65792 <NA> t f f 1
## 72540 72540 <NA> t t f 5
## 75208 75208 <NA> t t t 2
## 92585 92585 <NA> t t t 1
## 96068 96068 <NA> t t f 1
## availability_60 availability_90
## 16246 Long Beach, California, United States Manuel
## 30584 Santa Monica, California, United States Medi
## 47615 New York, New York, United States David
## 56281 Los Angeles, California, United States Sharlette
## 65792 San Francisco, California, United States Winni
## 72540 Los Angeles, California, United States Ryan
## 75208 Los Angeles, California, United States Sharlette
## 92585 Austin, Texas, United States Amanda
## 96068 Los Angeles, California, United States Moe
## bathrooms bed_type bedrooms beds cancellation_policy
## 16246 Long Beach 100% within an hour 2016-07-18 1.0
## 30584 Del Rey 100% within an hour 2011-07-18 2.0
## 47615 Williamsburg 100% within a day 2011-09-21 1.0
## 56281 Mid-Wilshire 100% within an hour 2014-12-26 2.0
## 65792 Oceanview <NA> <NA> 2015-06-25 1.0
## 72540 Downtown 81% within an hour 2013-01-20 5.0
## 75208 Mid-Wilshire 100% within an hour 2014-12-26 2.0
## 92585 North Shoal Creek 100% within an hour 2014-09-12 1.0
## 96068 <NA> 100% within an hour 2015-06-30 1.0
## city
## 16246 ['email', 'phone', 'facebook', 'reviews', 'jumio', 'government_id']
## 30584 ['email', 'phone', 'facebook', 'reviews', 'kba']
## 47615 ['email', 'phone', 'facebook', 'reviews', 'jumio']
## 56281 ['email', 'phone', 'google', 'reviews', 'kba']
## 65792 ['email', 'phone', 'reviews']
## 72540 ['email', 'phone', 'reviews', 'jumio', 'work_email']
## 75208 ['email', 'phone', 'google', 'reviews', 'kba']
## 92585 ['email', 'phone', 'reviews', 'kba']
## 96068 ['email', 'phone', 'reviews', 'jumio']
## city_name
## 16246 - Keep kitchen, kitchenware and bathroom clean at all times with no exceptions. - Kitchen is strictly close from 10:00 pm to 6:00 am. - No smoking in the building. - No unregistered overnight guest. - There is a friendly dog in my home; so please be pet friendly.
## 30584 No smoking and no pets please.
## 47615 <NA>
## 56281 We hope you are a dog lover. We have two Goldendoodle puppies who love humans. Quiet hours are from 10pm-7am
## 65792 No loud noise after 9pm. Guests are responsible for emptying the garbage during the stay.
## 72540 NOTE: we've been having a lot of people book with the intentions of using our home for photoshoots, video, parties corporate events without telling us. Please do not use instant book for a shoot or party without email first, doing so will result in your forfeiting the rental fee or deposit. Our rates are completely different for this. Please be upfront with what you're looking to do and we will be very cool! If we do find that you have broken these rules, and we absolutely will, you will be asked to leave and we will not refund your stay Thanks so much :) -no additional people in the unit than the number you booked for any reason. -new policy for the building, please no roof usage. -Please no parties, as this is a residential building. There is a $5000 fee for unauthorized parties in our building. -The rooftop is accessible via the bedroom, be aware that the trap door takes a bit of muscle to open. Walking on the roof above the neighbors' lofts is strictly prohibited. If the fore
## 75208 We hope you are a dog lover. We have two large Goldendoodles who love humans. Quiet hours are 10 pm- 7am. Thank you.
## 92585 â\200¢ Check-out time is 12 pm and check-in time is 2PM or later. I'm very sensitive to cigarette smoke, so please no smokers. If you must smoke, please do so in the front yard and change your shirt when you come inside. Thank you for your understanding. Please don't feed my animals during your stay unless I'm with you. Also, please don't teach my parrot any naughty words, and don't touch or handle him unless I am present. This is for your safety. While I love animals, I can't accept any animals in my home. This is to both keep my animals safe, and to keep my home hair free (I'm allergic to dogs and many of my guests and students are allergic to cats.) Although if you want to bring your beta fish, that's fine (yes, I did get that request). Please clean up after yourself when you use the common areas during your stay (kitchen, living room, bathrooms, porch, studio).
## 96068 This listing can accommodate 2 adults, extra fee after 2 people. No smoking, No party after 10 pm. I love my neighbors. I would like my guests to respect them.
## cleaning_fee
## 16246 f
## 30584 f
## 47615 f
## 56281 f
## 65792 f
## 72540 f
## 75208 f
## 92585 f
## 96068 t
## country
## 16246 Basically, I am out most of the day because I am working and attending school.
## 30584 Free free to contact us anytime! We'd love to show you around if you're not familiar with the area. We'll be glad to give recommendations and we'll most likely be in town (just a few minutes away in Santa Monica) during your stay.
## 47615 We will not be on premises however in the city. We will be on call if any issues arise.
## 56281 <NA>
## 65792 å\217¯ä»£å®šç§Ÿè½¦,è´ä¹°é—¨ç¥¨æ\210–旅行其他咨询
## 72540 We will be here to show you around when you arrive but the place is yours unless we get noise complaints. There is very little noise insulation in the building, so on the main floor of the loft you must keep noise to a minimum. The upstairs floor is more insulated so you can hang out, watch movies or play video games in the theater if you plan to be up late.
## 75208 There are a lot of good restaurants around the area, things to explore... I can also give recommendations if you're new to the area! Ask away and I'll help you maximize your time and have fun!
## 92585 I'm an oil painter who works from home so I will usually be available for my guests when needed, but I also respect your privacy and try to stay quietly out of the way.
## 96068 I will meet you upon arrival, and I am available to chat whenever you need me. On your departure you will meet me or a trusted friend/neighbor.
## country_code description experiences_offered extra_people
## 16246 <NA> t <NA> 33.7751358179
## 30584 <NA> t City of Los Angeles, CA 33.9847962097
## 47615 f t <NA> 40.7115296729
## 56281 <NA> t City of Los Angeles, CA 34.0550373047
## 65792 f t {"SAN FRANCISCO"} 37.7172817854
## 72540 <NA> t City of Los Angeles, CA 34.0305066944
## 75208 <NA> t City of Los Angeles, CA 34.0544381605
## 92585 <NA> t <NA> 30.3646152573
## 96068 <NA> f City of Los Angeles, CA 34.04703657
## first_review guests_included host_about host_acceptance_rate
## 16246 <NA> -118.18485 Los Angeles 1125
## 30584 <NA> -118.41256 Los Angeles 99
## 47615 <NA> -73.96088 New York 1125
## 56281 <NA> -118.29951 Los Angeles 60
## 65792 STR-0000408 -122.45269 San Francisco 28
## 72540 <NA> -118.24847 Los Angeles 1125
## 75208 <NA> -118.29775 Los Angeles 1125
## 92585 <NA> -97.73292 Austin 1125
## 96068 <NA> -118.46222 Los Angeles 120
## host_has_profile_pic host_identity_verified
## 16246 2 <NA>
## 30584 22 $2,960.00
## 47615 2 <NA>
## 56281 2 <NA>
## 65792 3 <NA>
## 72540 1 $8,500.00
## 75208 2 <NA>
## 92585 2 <NA>
## 96068 3 $3,200.00
## host_is_superhost
## 16246 Modern, intimate condo in Downtown Long Beach
## 30584 Amazing Retreat w/ Jacuzzi and Pool
## 47615 Brooklyn Style Williamsburg 2 BR Apartment
## 56281 Koreatown sunlit room
## 65792 Rooms with private entrance 独立ä¸\200æ\210¿ä¸\200厅
## 72540 HUGE 3,500 sq/ft Downtown Los Angeles Loft
## 75208 Private Koreatown Master Bedroom
## 92585 ART, ANIMALS & NATURE! Private room
## 96068 Penthouse Oceanview/UCLA/Parking
## host_listings_count
## 16246 Long Beach Downtown (LBDT) is an amazing place to stay, a few blocks away from the beach, Shoreline Village, Pike, The Queen Mary and connected with all the main attractions of Los Angeles such as Hollywood, Universal Studios, Disneyland, Knott's Berry Farm, LA Live, Staple Center, Dodgers Stadium, Angels Stadium through shuttles, train, and buses if you are coming for entertainment. On the other hand if you are going to stay for business or professional studies, LBDT is close to CSULB as well to both campuses of Long Beach Community College, the port of Long Beach, Long Beach Convention Center.
## 30584 A few minutes away are Marina Del Rey (where you can go yachting, kayaking, fine dining, etc), Playa Del Rey beach, and the only beach you can enjoy bonfires in LA- Dockweiler. The Runway shopping mall just opened.. Restaurants, Whole foods market, shopping, parks, and entertainment are only a mile away! The place is conveniently located with easy commutes to the South Bay citites and the rest of LA! Bike paths are steps away where you could easily bike through the wetlands and to the beach! Plus, there are two reserved parking spaces in addition to many more parking always available for guests. Truly a gem :)
## 47615 We are surrounded by many independent coffee shops, fun bars and a range of different lively restaurants with many different cuisines.
## 56281 <NA>
## 65792 <NA>
## 72540 Located on the edge of the Fashion District, we are on quite possibly the quietest block in Downtown Los Angeles. The loft is about a $4 Uber ride from the city center where you will find thousands of restaurants and bars within walking distance.
## 75208 <NA>
## 92585 My neighborhood is closely located to the Domain and the Arboretum area with loads of shopping and dining options. I'm also closely located to two Whole Foods Markets. I'm within walking distance of many dining and entertainment options, including The Alamo Drafthouse Village.
## 96068 Some of the best restaurants of Los Angeles is in walking distance. Chinese, Persian, Thai, Vietnamese, Berger, Organic, Vegan, Vegeterian, Mexican and more. There are few bars walking distance to me.
## host_location
## 16246 Long Beach
## 30584 Del Rey
## 47615 Williamsburg
## 56281 Mid-Wilshire
## 65792 Oceanview
## 72540 Downtown
## 75208 Mid-Wilshire
## 92585 North Shoal Creek
## 96068 <NA>
## host_name
## 16246 <NA>
## 30584 Speedy internet connection, with cable and Netflix on HD tv! Appliances and cookware include: coffee maker, microwave, stainless steel cookware, coffee maker, toaster, pots, water boiler and knives set. For your convenience: iron, iron board, hair dryer, extra beach/pool towels, crib, pillows, sheets, plush queen air mattress, and spacious walk-in closet with more than enough room to put your things :)
## 47615 <NA>
## 56281 There are a lot of good restaurants around the area, things to explore... I can also give recommendations if you're new to the area! Ask away and I'll help you maximize your time and have fun!
## 65792 <NA>
## 72540 Take a one mile car ride to some of the finest dining and cocktails on the west coast, or keep it über local. Daniel Benhaim, internationally renowned chef, who personally sources his meat and produce from local organic farms, throws private dinner parties next door 3 or 4 times a month. Look up his menu and calendar by googling his name and "because the wind." It is a magical evening where the beautiful people show their faces and eat Daniel's decadent art.
## 75208 <NA>
## 92585 I teach from my home studio on Tuesday, Wednesday and Friday evenings and Saturdays during the day, but this will not interfere with your living space. On most Friday evenings from 8-10 pm I host an art club meeting which will sometimes involve using the living room to watch a movie. I have a boyfriend who travels a lot so he usually isn't here, but sometimes he is. He is very friendly and respectful. Check out time is 12:00 pm, and check in is 2:00 pm.
## 96068 <NA>
## host_neighbourhood host_response_rate host_response_time host_since
## 16246 $63.00 Condominium f f
## 30584 $153.00 Apartment f f
## 47615 $195.00 Apartment f f
## 56281 $65.00 Apartment f f
## 65792 $195.00 House f f
## 72540 $445.00 Loft f f
## 75208 $75.00 Apartment f f
## 92585 $40.00 House f f
## 96068 $113.00 Apartment f f
## host_total_listings_count host_verifications house_rules
## 16246 f 100.0 Private room
## 30584 f 90.0 Entire home/apt
## 47615 f 80.0 Entire home/apt
## 56281 f 90.0 Private room
## 65792 t 98.0 Private room
## 72540 f 90.0 Entire home/apt
## 75208 f 100.0 Private room
## 92585 f 97.0 Private room
## 96068 f 88.0 Entire home/apt
## instant_bookable interaction
## 16246 <NA> Long Beach, CA
## 30584 $250.00 Los Angeles, CA
## 47615 $250.00 Brooklyn, NY
## 56281 $100.00 Los Angeles, CA
## 65792 $500.00 San Francisco, CA
## 72540 <NA> Los Angeles, CA
## 75208 $100.00 Los Angeles, CA
## 92585 $100.00 Austin, TX
## 96068 $200.00 West Los Angeles, CA
## is_business_travel_ready
## 16246 Roomie cozy bedroom (195 square ft) with a queen size bed, tv, a chair, AC, heater and coin laundry in the basement of the building.
## 30584 Completely renovated! 800 Sq feet -Luxurious jacuzzi and marble stand-alone shower -Hardwood flooring -Marble flooring in kitchen and bathroom -2 parking spaces (not tandem) -Stainless steel fridge -Dishwasher -All ceiling lights have dimmer switches -Gorgeous hardwood ceilings -Triple paned windows and balcony doors -Balcony looks out to the pool -Double lounger for relaxing on the balcony -Internet and Wifi -Cable and Netflix on HD tv -Comfy bed and mattress -We provide bedding and towels
## 47615 We are happy to share our 2 bedroom apartment that we call home. There are two good sized bedrooms both with Queen beds and sunlight throughout the day. There is a modern kitchen, large flat screen television with Verizon Cable/Apple TV, wifi internet, modern bathroom and a large communal rooftop with breathtaking Manhattan Skyline views. Also decorated by original artwork from our friend @lucasgeorpaints
## 56281 <NA>
## 65792 Close to Bart, easy to get to downtown
## 72540 It's a breathtaking space. Original hardwood floors and exposed brick, soaring 20' ceilings, a floating staircase, mid-century art, a beautiful reclaimed wood dining table, geometric metal sculptures, sequoia stump side tables, a CA King size Tempurpedic bed, and a trap door from the bedroom leading to a rooftop view worthy of Apple's recent iPhone 6 commercial, which was filmed here. The industrial neighborhood is quiet and empty, though a bit dirty, but everything cute in DTLA is only about 1 mile away, which makes ours the best view of the skyline. We also have a 300 megabit internet connection.
## 75208 The furniture is brand new and thoughtfully planned for this master bedroom. I've purchased a new queen sized memory foam mattress for your comfort.
## 92585 This charming home is located in the friendly neighborhood of North Shoal Creek is perfect for anyone who loves art, animals and nature. I have two happy, free-ranging chickens (Heraldine and Ruby) who lay fresh organic and soy-free eggs daily for your consumption. There are two outdoor snugly rabbits (Darwin and Pearl) who will gladly accept your ear rubs. And the king of the house is my adorable African Grey parrot, Jack. He may or may not let you scratch his head, but he is certain to entertain with his large vocabulary and sense of humor. My backyard is lush and green most of the year, and the lily pond is gorgeous when in bloom, which attracts a host of native birds and wildlife. Don't worry about those Texas mosquitoes, because I have a lovely screened in porch complete with comfy furniture and a hammock for you to nap or lounge in while safe from pesky insects. My kitchen is fully stocked with everything you need to make a meal, including a bread machine, Super Angel Juicer, cr
## 96068 The space is a large studio apartment with hip to ceiling windows. Tow sides are windows and two walls are mirrored. Sun enters the house as she rises and leaves the house as it sets. The views are spectacular! Views of the ocean and pool from one window and the Hill's and downtown Beverly Hills from the other. Ocean breeze comes in the house if you open one window on the ocean side and the other one in the kitchen. That feels you are in Heaven. Welcome to the Penthouse!
## is_location_exact jurisdiction_names
## 16246 <NA> CA
## 30584 <NA> CA
## 47615 <NA> NY
## 56281 <NA> CA
## 65792 <NA> CA
## 72540 <NA> CA
## 75208 <NA> CA
## 92585 <NA> TX
## 96068 <NA> CA
## latitude
## 16246 Long Beach, Long Beach, CA 90813, United States
## 30584 Del Rey, Los Angeles, CA 90066, United States
## 47615 Brooklyn, NY, United States
## 56281 Mid-Wilshire, Los Angeles, CA 90006, United States
## 65792 San Francisco, CA, United States
## 72540 Downtown, Los Angeles, CA 90021, United States
## 75208 Mid-Wilshire, Los Angeles, CA 90006, United States
## 92585 Briarwood Lane, Austin, TX 78757, United States
## 96068 West Los Angeles, CA 90025, United States
## license
## 16246 My place is in Long Beach downtown. Close to Pike, Shoreline Village, Queen Mary, Belmont Shore, and just around the corner from the night life scene.. Youâ\200\231ll love my place because of the comfy bed, the coziness, and the high ceilings. My place is good for couples, solo adventurers, and business travelers.
## 30584 Completely, beautifully renovated from floor to ceiling! Located in the heart of Westside- mins to Marina Del Rey, LMU, Playa Vista, Playa Del Rey, Mar Vista, Culver City, Venice Beach, Santa Monica. With 2 parking spaces. Very quiet complex with pool, hot tub, laundry onsite, bike access to the beach. Fully stocked kitchen with cookware. Family friendly for toddlers with new crib, high chair, and bath. HD television and high speed internet access.
## 47615 Two bedroom apartment in the middle of all Williamsburg's amazing restaurant, bar and cafe scene. 5 mins into Manhattan - between Marcy Ave JMZ line and Bedford L.
## 56281 WELCOME HOME! Our spacious room is well lit with a city feel. Close to The Grove, Downtown LA, Santa Monica, Beverly Hills, Silverlake. As this is your home away from home, I will ensure you feel comfortable.
## 65792 Very nice rooms in a convenient location, close to Bart, Muni and highways. 15 minutes drive from the airport or to downtown SF. One queen bed and one futon that can occupy up to 4 people.
## 72540 Spacious and beautiful, You will love this 3,500sqft loft. Located in an early 1900s industrial building, it features exposed brick, 25' high ceilings, Hue lights, a theatre with a 100"screen, Apple TV, steinway grand piano and more!
## 75208 Welcome Home! Our spacious room is well lit with a city feel! - Close to The Grove, Koreatown Plaza, Downtown LA, Santa Monica, Beverly Hills, Westwood - Convenient public transportation & most places are a cheap UBER ride away. I will provide FREE COFFEE/TEA/CHOCOLATE and clean sheets + towels + robe for you.
## 92585 This is the perfect place for art, animal and nature lovers in a charming and convenient neighborhood, with a well-traveled and friendly host. :)
## 96068 Gorgeous Ocean/Sunset view, Gated,24 hour security, parking,Beautiful Olympic pool,Gym,Walk to Brentwood restaurants, bars.Great Public Transport The studio has sofa-bed and private bathroom w/shower. 10 minutes to Santa Monica pier, 3rd street,UCLA.
## longitude
## 16246 The place is a couple of blocks away from the blue line, Long Beach Transit Mall, and close to 710 and 405 freeways.
## 30584 <NA>
## 47615 There are various transportation options. We are at the footstep of the Williamsburg bridge so an Uber/Taxi into Manhattan is only 5 mins. We are between the Marcy Ave (JMZ) and Bedford Ave (L) MTA Stops which are both one stop into Manhattan. Both stops are around 5mins away. There is also a citi bike rack at the end of our street.
## 56281 Public transit is 3 blocks away if you want to get around LA via metro! Very fast and convenient way of traveling. 10 min walk to the Wiltern 10 min walk to the Purple Line (Wilshire/Normandie station) 5 min drive to Downtown LA/Staples Center 10 min drive to Hollywood Blvd. 20 min drive to Santa Monica and Venice beach and UCLA 25 min drive to LAX. There is street parking in my neighborhood, and no dedicated off-street parking space. We hope you understand that parking may be hard to find right away especially late at night, as it is a densely populated street. Lastly, our apartment is on the second floor!
## 65792 10 minutes walk to Bart station that can take you to downtown or airport. Muni M line two blocks away.
## 72540 Uber, Lyft and Sidecar are readily available to take you the 1 mile to the center of Downtown Los Angeles, where your entertainment possibilities are literally endless.
## 75208 Public transit is 3 blocks away if you want to get around LA via metro! Very fast and convenient way of traveling. 10 min walk to the Wiltern 10 min walk to the Purple Line (Wilshire/Normandie station) 5 min drive to Downtown LA/Staples Center 10 min drive to Hollywood Blvd. 20 min drive to Santa Monica and Venice beach and UCLA 25 minutes driving to LAX. There is street parking if you need to park your car. As it is a densely populated street, parking may be hard to find. We are accepting guests who are using primarily Uber or public transportation.
## 92585 ****UPDATE: Uber and Lyft are temporarily suspended in Austin. Other options are GetMe, Wingz, Fare and RideAustin. I can't attest to their reliability or price as I have not yet used them. However, if prior arrangements are made and I am available, I will happily take you to and from the airport and drop you off around Austin for "donations" that you deem fair (please take into account my time and distance driving to and from your drop off place). My house is about a 10-15 minute ride from almost anywhere in Austin, and about a 30 minute drive from the airport unless it's rush hour traffic. There is a bus stop a block away that will take you downtown in about 40 minutes. I also have a bicycle available to rent for $10 a day.
## 96068 If you are not driving...Public transportation is right at the corner, the bus schedules are easy to access online. These buses will get you to the beach in Venice and Santa Monica or even as far as Downtown Los Angeles or Malibu. You can take the bus to Santa Monica Beach in 15-20 minutes (3 miles away).
## market maximum_nights minimum_nights monthly_price name
## 16246 <NA> 90813 0 <NA> <NA>
## 30584 <NA> 90066 0 <NA> <NA>
## 47615 <NA> 11211 0 <NA> <NA>
## 56281 <NA> 90006 0 <NA> <NA>
## 65792 $1,100.00 94112 0 <NA> <NA>
## 72540 $2,999.00 90021 0 <NA> <NA>
## 75208 <NA> 90006 0 <NA> <NA>
## 92585 <NA> 78757 1 <NA> <NA>
## 96068 $750.00 90025 0 <NA> <NA>
## neighborhood_overview neighbourhood notes price property_type
## 16246 <NA> <NA> <NA> <NA> <NA>
## 30584 <NA> <NA> <NA> <NA> <NA>
## 47615 <NA> <NA> <NA> <NA> <NA>
## 56281 <NA> <NA> <NA> <NA> <NA>
## 65792 <NA> <NA> <NA> <NA> <NA>
## 72540 <NA> <NA> <NA> <NA> <NA>
## 75208 <NA> <NA> <NA> <NA> <NA>
## 92585 <NA> <NA> <NA> <NA> <NA>
## 96068 <NA> <NA> <NA> <NA> <NA>
## require_guest_phone_verification require_guest_profile_picture
## 16246 <NA> <NA>
## 30584 <NA> <NA>
## 47615 <NA> <NA>
## 56281 <NA> <NA>
## 65792 <NA> <NA>
## 72540 <NA> <NA>
## 75208 <NA> <NA>
## 92585 <NA> <NA>
## 96068 <NA> <NA>
## requires_license room_type security_deposit smart_location space
## 16246 <NA> <NA> <NA> <NA> <NA>
## 30584 <NA> <NA> <NA> <NA> <NA>
## 47615 <NA> <NA> <NA> <NA> <NA>
## 56281 <NA> <NA> <NA> <NA> <NA>
## 65792 <NA> <NA> <NA> <NA> <NA>
## 72540 <NA> <NA> <NA> <NA> <NA>
## 75208 <NA> <NA> <NA> <NA> <NA>
## 92585 <NA> <NA> <NA> <NA> <NA>
## 96068 <NA> <NA> <NA> <NA> <NA>
## square_feet state street summary transit weekly_price zipcode
## 16246 NA <NA> <NA> <NA> <NA> <NA> <NA>
## 30584 NA <NA> <NA> <NA> <NA> <NA> <NA>
## 47615 NA <NA> <NA> <NA> <NA> <NA> <NA>
## 56281 NA <NA> <NA> <NA> <NA> <NA> <NA>
## 65792 NA <NA> <NA> <NA> <NA> <NA> <NA>
## 72540 NA <NA> <NA> <NA> <NA> <NA> <NA>
## 75208 NA <NA> <NA> <NA> <NA> <NA> <NA>
## 92585 NA <NA> <NA> <NA> <NA> <NA> <NA>
## 96068 NA <NA> <NA> <NA> <NA> <NA> <NA>
All the columns seem to be misplaced. We can store this separately and rearrange them -
misplacedX <- c(16246, 30584, 47615, 56281, 65792, 72540, 75208, 92585, 96068)
misplaced_df_x <- df_x[df_x$extra_people %in% extra_people_discrepancies,]
df_x[df_x$extra_people %in% extra_people_discrepancies, 3:70] <- NA
df_x[df_x$X %in% misplacedX, ]
## X access accommodates amenities availability_30 availability_365
## 16246 16246 <NA> <NA> <NA> <NA> NA
## 30584 30584 <NA> <NA> <NA> <NA> NA
## 47615 47615 <NA> <NA> <NA> <NA> NA
## 56281 56281 <NA> <NA> <NA> <NA> NA
## 65792 65792 <NA> <NA> <NA> <NA> NA
## 72540 72540 <NA> <NA> <NA> <NA> NA
## 75208 75208 <NA> <NA> <NA> <NA> NA
## 92585 92585 <NA> <NA> <NA> <NA> NA
## 96068 96068 <NA> <NA> <NA> <NA> NA
## availability_60 availability_90 bathrooms bed_type bedrooms beds
## 16246 <NA> <NA> <NA> <NA> <NA> <NA>
## 30584 <NA> <NA> <NA> <NA> <NA> <NA>
## 47615 <NA> <NA> <NA> <NA> <NA> <NA>
## 56281 <NA> <NA> <NA> <NA> <NA> <NA>
## 65792 <NA> <NA> <NA> <NA> <NA> <NA>
## 72540 <NA> <NA> <NA> <NA> <NA> <NA>
## 75208 <NA> <NA> <NA> <NA> <NA> <NA>
## 92585 <NA> <NA> <NA> <NA> <NA> <NA>
## 96068 <NA> <NA> <NA> <NA> <NA> <NA>
## cancellation_policy city city_name cleaning_fee country country_code
## 16246 <NA> <NA> <NA> <NA> <NA> <NA>
## 30584 <NA> <NA> <NA> <NA> <NA> <NA>
## 47615 <NA> <NA> <NA> <NA> <NA> <NA>
## 56281 <NA> <NA> <NA> <NA> <NA> <NA>
## 65792 <NA> <NA> <NA> <NA> <NA> <NA>
## 72540 <NA> <NA> <NA> <NA> <NA> <NA>
## 75208 <NA> <NA> <NA> <NA> <NA> <NA>
## 92585 <NA> <NA> <NA> <NA> <NA> <NA>
## 96068 <NA> <NA> <NA> <NA> <NA> <NA>
## description experiences_offered extra_people first_review guests_included
## 16246 <NA> <NA> <NA> <NA> NA
## 30584 <NA> <NA> <NA> <NA> NA
## 47615 <NA> <NA> <NA> <NA> NA
## 56281 <NA> <NA> <NA> <NA> NA
## 65792 <NA> <NA> <NA> <NA> NA
## 72540 <NA> <NA> <NA> <NA> NA
## 75208 <NA> <NA> <NA> <NA> NA
## 92585 <NA> <NA> <NA> <NA> NA
## 96068 <NA> <NA> <NA> <NA> NA
## host_about host_acceptance_rate host_has_profile_pic
## 16246 <NA> <NA> <NA>
## 30584 <NA> <NA> <NA>
## 47615 <NA> <NA> <NA>
## 56281 <NA> <NA> <NA>
## 65792 <NA> <NA> <NA>
## 72540 <NA> <NA> <NA>
## 75208 <NA> <NA> <NA>
## 92585 <NA> <NA> <NA>
## 96068 <NA> <NA> <NA>
## host_identity_verified host_is_superhost host_listings_count
## 16246 <NA> <NA> <NA>
## 30584 <NA> <NA> <NA>
## 47615 <NA> <NA> <NA>
## 56281 <NA> <NA> <NA>
## 65792 <NA> <NA> <NA>
## 72540 <NA> <NA> <NA>
## 75208 <NA> <NA> <NA>
## 92585 <NA> <NA> <NA>
## 96068 <NA> <NA> <NA>
## host_location host_name host_neighbourhood host_response_rate
## 16246 <NA> <NA> <NA> <NA>
## 30584 <NA> <NA> <NA> <NA>
## 47615 <NA> <NA> <NA> <NA>
## 56281 <NA> <NA> <NA> <NA>
## 65792 <NA> <NA> <NA> <NA>
## 72540 <NA> <NA> <NA> <NA>
## 75208 <NA> <NA> <NA> <NA>
## 92585 <NA> <NA> <NA> <NA>
## 96068 <NA> <NA> <NA> <NA>
## host_response_time host_since host_total_listings_count
## 16246 <NA> <NA> <NA>
## 30584 <NA> <NA> <NA>
## 47615 <NA> <NA> <NA>
## 56281 <NA> <NA> <NA>
## 65792 <NA> <NA> <NA>
## 72540 <NA> <NA> <NA>
## 75208 <NA> <NA> <NA>
## 92585 <NA> <NA> <NA>
## 96068 <NA> <NA> <NA>
## host_verifications house_rules instant_bookable interaction
## 16246 <NA> <NA> <NA> <NA>
## 30584 <NA> <NA> <NA> <NA>
## 47615 <NA> <NA> <NA> <NA>
## 56281 <NA> <NA> <NA> <NA>
## 65792 <NA> <NA> <NA> <NA>
## 72540 <NA> <NA> <NA> <NA>
## 75208 <NA> <NA> <NA> <NA>
## 92585 <NA> <NA> <NA> <NA>
## 96068 <NA> <NA> <NA> <NA>
## is_business_travel_ready is_location_exact jurisdiction_names latitude
## 16246 <NA> <NA> <NA> <NA>
## 30584 <NA> <NA> <NA> <NA>
## 47615 <NA> <NA> <NA> <NA>
## 56281 <NA> <NA> <NA> <NA>
## 65792 <NA> <NA> <NA> <NA>
## 72540 <NA> <NA> <NA> <NA>
## 75208 <NA> <NA> <NA> <NA>
## 92585 <NA> <NA> <NA> <NA>
## 96068 <NA> <NA> <NA> <NA>
## license longitude market maximum_nights minimum_nights monthly_price name
## 16246 <NA> <NA> <NA> NA NA <NA> <NA>
## 30584 <NA> <NA> <NA> NA NA <NA> <NA>
## 47615 <NA> <NA> <NA> NA NA <NA> <NA>
## 56281 <NA> <NA> <NA> NA NA <NA> <NA>
## 65792 <NA> <NA> <NA> NA NA <NA> <NA>
## 72540 <NA> <NA> <NA> NA NA <NA> <NA>
## 75208 <NA> <NA> <NA> NA NA <NA> <NA>
## 92585 <NA> <NA> <NA> NA NA <NA> <NA>
## 96068 <NA> <NA> <NA> NA NA <NA> <NA>
## neighborhood_overview neighbourhood notes price property_type
## 16246 <NA> <NA> <NA> <NA> <NA>
## 30584 <NA> <NA> <NA> <NA> <NA>
## 47615 <NA> <NA> <NA> <NA> <NA>
## 56281 <NA> <NA> <NA> <NA> <NA>
## 65792 <NA> <NA> <NA> <NA> <NA>
## 72540 <NA> <NA> <NA> <NA> <NA>
## 75208 <NA> <NA> <NA> <NA> <NA>
## 92585 <NA> <NA> <NA> <NA> <NA>
## 96068 <NA> <NA> <NA> <NA> <NA>
## require_guest_phone_verification require_guest_profile_picture
## 16246 <NA> <NA>
## 30584 <NA> <NA>
## 47615 <NA> <NA>
## 56281 <NA> <NA>
## 65792 <NA> <NA>
## 72540 <NA> <NA>
## 75208 <NA> <NA>
## 92585 <NA> <NA>
## 96068 <NA> <NA>
## requires_license room_type security_deposit smart_location space
## 16246 <NA> <NA> <NA> <NA> <NA>
## 30584 <NA> <NA> <NA> <NA> <NA>
## 47615 <NA> <NA> <NA> <NA> <NA>
## 56281 <NA> <NA> <NA> <NA> <NA>
## 65792 <NA> <NA> <NA> <NA> <NA>
## 72540 <NA> <NA> <NA> <NA> <NA>
## 75208 <NA> <NA> <NA> <NA> <NA>
## 92585 <NA> <NA> <NA> <NA> <NA>
## 96068 <NA> <NA> <NA> <NA> <NA>
## square_feet state street summary transit weekly_price zipcode
## 16246 NA <NA> <NA> <NA> <NA> <NA> <NA>
## 30584 NA <NA> <NA> <NA> <NA> <NA> <NA>
## 47615 NA <NA> <NA> <NA> <NA> <NA> <NA>
## 56281 NA <NA> <NA> <NA> <NA> <NA> <NA>
## 65792 NA <NA> <NA> <NA> <NA> <NA> <NA>
## 72540 NA <NA> <NA> <NA> <NA> <NA> <NA>
## 75208 NA <NA> <NA> <NA> <NA> <NA> <NA>
## 92585 NA <NA> <NA> <NA> <NA> <NA> <NA>
## 96068 NA <NA> <NA> <NA> <NA> <NA> <NA>
Now we can work on transforming the data -
# First, the columns which have prices -
# cleaning_fee, extra_people, price, security_deposit, weekly_price, monthly_price
df_x$cleaning_fee <- as.numeric(str_replace_all(df_x$cleaning_fee, "[\\$,]", ""))
# test data
tdf$cleaning_fee <- as.numeric(str_replace_all(tdf$cleaning_fee, "[\\$,]", ""))
## Warning: NAs introduced by coercion
df_x$extra_people <- as.numeric(str_replace_all(df_x$extra_people, "[\\$,]", ""))
# test data
tdf$extra_people <- as.numeric(str_replace_all(tdf$extra_people, "[\\$,]", ""))
df_x$price <- as.numeric(str_replace_all(df_x$price, "[\\$,]", ""))
# test data
tdf$price <- as.numeric(str_replace_all(tdf$price, "[\\$,]", ""))
df_x$security_deposit <- as.numeric(str_replace_all(df_x$security_deposit, "[\\$,]", ""))
# test data
tdf$security_deposit <- as.numeric(str_replace_all(tdf$security_deposit, "[\\$,]", ""))
df_x$weekly_price <- as.numeric(str_replace_all(df_x$weekly_price, "[\\$,]", ""))
# test data
tdf$weekly_price <- as.numeric(str_replace_all(tdf$weekly_price, "[\\$,]", ""))
df_x$monthly_price <- as.numeric(str_replace_all(df_x$monthly_price, "[\\$,]", ""))
# test data
tdf$monthly_price <- as.numeric(str_replace_all(tdf$monthly_price, "[\\$,]", ""))
# Next, the columns which havedf_x$host_acceptance_rate - has %
# host_acceptance_rate, host_response_rate
df_x$host_acceptance_rate <- as.numeric(str_replace_all(df_x$host_acceptance_rate, "%", ""))
# test data
tdf$host_acceptance_rate <- as.numeric(str_replace_all(tdf$host_acceptance_rate, "%", ""))
df_x$host_response_rate <- as.numeric(str_replace_all(df_x$host_response_rate, "%", ""))
# test data
tdf$host_response_rate <- as.numeric(str_replace_all(tdf$host_response_rate, "%", ""))
## Warning: NAs introduced by coercion
The other columns now -
1 accommodates Factor 2 availability_30 Factor 3 availability_365 int 4 availability_60 Factor 5 availability_90 Factor 6 bathrooms Factor 7 bedrooms Factor 8 beds Factor # 9 cleaning_fee Factor - (needs to be cleaned - has $ symbol like $250.00) # 10 extra_people Factor - (needs to be cleaned - has $ symbol like $250.00) 11 guests_included num # 12 host_acceptance_rate Factor - (needs to be cleaned - has format like so - 100%) 13 host_listings_count Factor # 14 host_response_rate Factor - (needs to be cleaned - has format like so - 100%) 15 host_total_listings_count Factor 16 latitude Factor 17 longitude Factor 18 maximum_nights num 19 minimum_nights int # 20 monthly_price Factor - (needs to be cleaned - has $ symbol like $250.00) # 21 price Factor - (needs to be cleaned - has $ symbol like $250.00) # 22 security_deposit Factor - (needs to be cleaned - has $ symbol like $250.00) 23 square_feet int # 24 weekly_price Factor - (needs to be cleaned - has $ symbol like $250.00)
qplot(as.numeric(accommodates), data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9 rows containing non-finite values (stat_bin).
qplot(as.numeric(availability_30), data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9 rows containing non-finite values (stat_bin).
qplot(as.numeric(availability_365), data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9 rows containing non-finite values (stat_bin).
qplot(as.numeric(availability_60), data = df_x) + coord_flip()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9 rows containing non-finite values (stat_bin).
qplot(as.numeric(availability_90), data = df_x) + coord_flip()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9 rows containing non-finite values (stat_bin).
qplot(as.numeric(bathrooms), data = df_x) + coord_flip()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 254 rows containing non-finite values (stat_bin).
qplot(as.numeric(bedrooms), data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 101 rows containing non-finite values (stat_bin).
qplot(as.numeric(beds), data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 92 rows containing non-finite values (stat_bin).
qplot(as.numeric(guests_included), data = df_x) + coord_flip()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9 rows containing non-finite values (stat_bin).
qplot(as.numeric(host_listings_count), data = df_x) + coord_flip()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 151 rows containing non-finite values (stat_bin).
qplot(as.numeric(host_total_listings_count), data = df_x) + coord_flip()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 151 rows containing non-finite values (stat_bin).
# qplot(longitude, data = df_x)
# qplot(latitude, data = df_x)
qplot(as.numeric(maximum_nights), data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 19 rows containing non-finite values (stat_bin).
qplot(as.numeric(minimum_nights), data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 19 rows containing non-finite values (stat_bin).
qplot(as.numeric(square_feet), data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 98419 rows containing non-finite values (stat_bin).
Visualize in log scale -
qplot(as.numeric(accommodates), data = df_x) + scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9 rows containing non-finite values (stat_bin).
qplot(as.numeric(availability_30), data = df_x) + scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9 rows containing non-finite values (stat_bin).
qplot(as.numeric(availability_365), data = df_x) + scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 16728 rows containing non-finite values (stat_bin).
qplot(as.numeric(availability_60), data = df_x) + coord_flip() + scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9 rows containing non-finite values (stat_bin).
qplot(as.numeric(availability_90), data = df_x) + coord_flip() + scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9 rows containing non-finite values (stat_bin).
qplot(as.numeric(bathrooms), data = df_x) + coord_flip() + scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 254 rows containing non-finite values (stat_bin).
qplot(as.numeric(bedrooms), data = df_x) + scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 101 rows containing non-finite values (stat_bin).
qplot(as.numeric(beds), data = df_x) + scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 92 rows containing non-finite values (stat_bin).
qplot(as.numeric(guests_included), data = df_x) + coord_flip() + scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 365 rows containing non-finite values (stat_bin).
qplot(as.numeric(host_listings_count), data = df_x) + coord_flip() + scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 151 rows containing non-finite values (stat_bin).
qplot(as.numeric(host_total_listings_count), data = df_x) + coord_flip() + scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 151 rows containing non-finite values (stat_bin).
# qplot(longitude, data = df_x)
# qplot(latitude, data = df_x)
qplot(as.numeric(maximum_nights), data = df_x) + scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 19 rows containing non-finite values (stat_bin).
qplot(as.numeric(minimum_nights), data = df_x) + scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 19 rows containing non-finite values (stat_bin).
qplot(as.numeric(square_feet), data = df_x) + scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 98536 rows containing non-finite values (stat_bin).
Convert these columns to numeric
num_cols <- c("accommodates", "availability_30", "availability_365", "availability_60",
"availability_90", "bathrooms", "bedrooms", "beds", "guests_included",
"host_listings_count", "host_total_listings_count",
"maximum_nights", "minimum_nights", "square_feet")
geo_cols <- c("latitude", "longitude")
df_x[,num_cols] <- sapply(df_x[,num_cols],as.numeric)
# test data
tdf[,num_cols] <- sapply(tdf[,num_cols],as.numeric)
df_x[,geo_cols] <- sapply(df_x[,geo_cols],as.character)
df_x[,geo_cols] <- sapply(df_x[,geo_cols],as.numeric)
# test data
tdf[,geo_cols] <- sapply(tdf[,geo_cols],as.character)
tdf[,geo_cols] <- sapply(tdf[,geo_cols],as.numeric)
## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion
## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion
Observations for these variables -
ggplot(gather(df_x[, num_cols]), aes(value)) +
geom_histogram(stat = "count") +
facet_wrap(~key, scales = 'free_x')
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Removed 99260 rows containing non-finite values (stat_count).
Now a look at categorical variables -
8 host_has_profile_pic Factor 9 host_identity_verified Factor 10 host_is_superhost Factor # 11 host_location Factor # 12 host_neighbourhood Factor # 13 host_response_time Factor 14 instant_bookable Factor 15 is_business_travel_ready Factor 16 is_location_exact Factor # 17 jurisdiction_names Factor # 18 license Factor # 19 market Factor # 20 neighbourhood Factor # 21 property_type Factor 22 require_guest_phone_verification Factor 23 require_guest_profile_picture Factor 24 requires_license Factor # 25 room_type Factor # 26 smart_location Factor # 27 state Factor
There are a bunch of columns which have ‘t’ and ‘f’
First, histograms.
These columns seem to comprise mostly of ’f’s and ’t’s.
We need to figure out how to deal with this specific column since more than 45% of the values are missing. We’ll get to that but first, let’s take care of the other columns.
# View(table(tdf$host_has_profile_pic))
# View(table(tdf$host_identity_verified))
# View(table(tdf$host_is_superhost))
# View(table(tdf$instant_bookable))
# View(table(tdf$is_business_travel_ready))
# View(table(tdf$is_location_exact))
# View(table(tdf$require_guest_phone_verification))
# View(table(tdf$require_guest_profile_picture))
# View(table(tdf$requires_license))
Let’s fix the discrepancies in the test data
superhost_discrepancies <- c("Bed,Bath&Bike in Sunny Santa Monica", "Pristine Mid-Century Modern w 180° Canyon View!")
misplacedy <- c(775, 10274)
misplaced_tdf <- tdf[tdf$host_is_superhost %in% superhost_discrepancies, ]
tdf[tdf$host_is_superhost %in% superhost_discrepancies, 3:70] <- NA
tdf[tdf$X %in% misplacedy, ]
## X access accommodates amenities availability_30 availability_365
## 775 775 <NA> NA <NA> NA NA
## 10274 10274 <NA> NA <NA> NA NA
## availability_60 availability_90 bathrooms bed_type bedrooms beds
## 775 NA NA NA <NA> NA NA
## 10274 NA NA NA <NA> NA NA
## cancellation_policy city city_name cleaning_fee country country_code
## 775 <NA> <NA> <NA> NA <NA> <NA>
## 10274 <NA> <NA> <NA> NA <NA> <NA>
## description experiences_offered extra_people first_review guests_included
## 775 <NA> <NA> NA <NA> NA
## 10274 <NA> <NA> NA <NA> NA
## host_about host_acceptance_rate host_has_profile_pic
## 775 <NA> NA <NA>
## 10274 <NA> NA <NA>
## host_identity_verified host_is_superhost host_listings_count
## 775 <NA> <NA> NA
## 10274 <NA> <NA> NA
## host_location host_name host_neighbourhood host_response_rate
## 775 <NA> <NA> <NA> NA
## 10274 <NA> <NA> <NA> NA
## host_response_time host_since host_total_listings_count
## 775 <NA> <NA> NA
## 10274 <NA> <NA> NA
## host_verifications house_rules instant_bookable interaction
## 775 <NA> <NA> <NA> <NA>
## 10274 <NA> <NA> <NA> <NA>
## is_business_travel_ready is_location_exact jurisdiction_names latitude
## 775 <NA> <NA> <NA> NA
## 10274 <NA> <NA> <NA> NA
## license longitude market maximum_nights minimum_nights monthly_price name
## 775 <NA> NA <NA> NA NA NA <NA>
## 10274 <NA> NA <NA> NA NA NA <NA>
## neighborhood_overview neighbourhood notes price property_type
## 775 <NA> <NA> <NA> NA <NA>
## 10274 <NA> <NA> <NA> NA <NA>
## require_guest_phone_verification require_guest_profile_picture
## 775 <NA> <NA>
## 10274 <NA> <NA>
## requires_license room_type security_deposit smart_location space
## 775 <NA> <NA> NA <NA> <NA>
## 10274 <NA> <NA> NA <NA> <NA>
## square_feet state street summary transit weekly_price zipcode
## 775 NA <NA> <NA> <NA> <NA> NA NA
## 10274 NA <NA> <NA> <NA> <NA> NA NA
Convert t’s and f’s to 1s and 0s
# 1. df$host_has_profile_pic - factor
df_x$host_has_profile_pic <- ifelse(df_x_main$host_has_profile_pic == "t", 1, 0)
qplot(x = host_has_profile_pic, data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 142 rows containing non-finite values (stat_bin).
# test data
tdf$host_has_profile_pic <- ifelse(tdf_x_main$host_has_profile_pic == "t", 1, 0)
# 2. df$host_identity_verified - factor
df_x$host_identity_verified <- ifelse(df_x_main$host_identity_verified == "t", 1, 0)
qplot(x = host_identity_verified, data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 148 rows containing non-finite values (stat_bin).
# test data
tdf$host_identity_verified <- ifelse(tdf_x_main$host_identity_verified == "t", 1, 0)
# 3. df$host_is_superhost - factor
df_x$host_is_superhost <- ifelse(df_x_main$host_is_superhost== "t", 1, 0)
qplot(x = host_is_superhost, data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 142 rows containing non-finite values (stat_bin).
# test data
tdf$host_is_superhost <- ifelse(tdf_x_main$host_is_superhost== "t", 1, 0)
# 4. df$instant_bookable - factor
df_x$instant_bookable <- ifelse(df_x_main$instant_bookable == "t", 1, 0)
qplot(x = instant_bookable, data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 12 rows containing non-finite values (stat_bin).
# test data
tdf$instant_bookable <- ifelse(tdf_x_main$instant_bookable == "t", 1, 0)
# 5. df$is_business_travel_ready - factor
df_x$is_business_travel_ready <- ifelse(is.na(df_x_main$is_business_travel_ready), df_x_main$is_business_travel_ready, ifelse(df_x_main$is_business_travel_ready == "t", 1, 0))
qplot(x = is_business_travel_ready, data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 44551 rows containing non-finite values (stat_bin).
# test data
tdf$is_business_travel_ready <- ifelse(is.na(tdf_x_main$is_business_travel_ready), tdf_x_main$is_business_travel_ready, ifelse(tdf_x_main$is_business_travel_ready == "t", 1, 0))
# 6. df$is_location_exact - factor
df_x$is_location_exact <- ifelse(df_x_main$is_location_exact == "t", 1, 0)
qplot(x = is_location_exact, data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 19 rows containing non-finite values (stat_bin).
# test data
tdf$is_location_exact <- ifelse(tdf_x_main$is_location_exact == "t", 1, 0)
# 7. df$require_guest_phone_verification - factor
df_x$require_guest_phone_verification <- ifelse(df_x_main$require_guest_phone_verification == "t", 1, 0)
qplot(x = require_guest_phone_verification, data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 19 rows containing non-finite values (stat_bin).
# test data
tdf$require_guest_phone_verification <- ifelse(tdf_x_main$require_guest_phone_verification == "t", 1, 0)
# 8. df$require_guest_profile_picture - factor
df_x$require_guest_profile_picture <- ifelse(df_x_main$require_guest_profile_picture == "t", 1, 0)
qplot(x = require_guest_profile_picture, data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 19 rows containing non-finite values (stat_bin).
# test data
tdf$require_guest_profile_picture <- ifelse(tdf_x_main$require_guest_profile_picture == "t", 1, 0)
# 9. df$requires_license - factor
df_x$requires_license <- ifelse(df_x_main$requires_license == "t", 1, 0)
qplot(x = requires_license, data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 19 rows containing non-finite values (stat_bin).
# test data
tdf$requires_license <- ifelse(tdf_x_main$requires_license == "t", 1, 0)
Moving on to other categorical variables -
1 bed_type Factor 2 cancellation_policy Factor 3 city Factor 4 city_name Factor 5 country Factor 6 country_code Factor 7 experiences_offered Factor # 8 host_has_profile_pic Factor # 9 host_identity_verified Factor # 10 host_is_superhost Factor 11 host_location Factor 12 host_neighbourhood Factor 13 host_response_time Factor # 14 instant_bookable Factor # 15 is_business_travel_ready Factor # 16 is_location_exact Factor 17 jurisdiction_names Factor 18 license Factor 19 market Factor 20 neighbourhood Factor 21 property_type Factor # 22 require_guest_phone_verification Factor # 23 require_guest_profile_picture Factor # 24 requires_license Factor 25 room_type Factor 26 smart_location Factor 27 state Factor
qplot(df_x$bed_type) + coord_flip()
qplot(df_x$cancellation_policy) + coord_flip()
qplot(df_x$city) + coord_flip() # unclear - need to look at the table
qplot(df_x$city_name) + coord_flip()
qplot(df_x$country)+ coord_flip()
qplot(df_x$country_code)+ coord_flip()
qplot(df_x$experiences_offered)+ coord_flip()
qplot(df_x$host_location)+ coord_flip() # Unclear - need to look at the table
qplot(df_x$host_neighbourhood)+ coord_flip() # Unclear - need to look at the table
qplot(df_x$host_response_time)+ coord_flip()
qplot(df_x$jurisdiction_names)+ coord_flip() # Categorical list
qplot(df_x$license)+ coord_flip() # Unclear - need to look at the table
qplot(df_x$market)+ coord_flip() # Looks clean for the most part
qplot(df_x$neighbourhood)+ coord_flip() # Unclear - need to look at the table
qplot(df_x$property_type)+ coord_flip() # Looks neat for the most part
qplot(df_x$room_type)+ coord_flip() # clean
qplot(df_x$smart_location)+ coord_flip() # Unclear - need to look at the table
qplot(df_x$state)+ coord_flip() # needs to be cleaned
Table views for the variables with unclear plots -
# View(table(df_x$city)) # 738 categories
# View(table(df_x$host_location)) # 2154 categories; Format like so - Liverpool, England, United Kingdom
# View(table(df_x$host_neighbourhood)) # 1303 categories
# View(table(df_x$license)) # Looks like some unique ID - 7474 many categories
# View(table(df_x$neighbourhood)) # 1093 categories; Examples - Woodside, Windsor
# View(table(df_x$smart_location)) # 750 categories - Can be cleaned by converting all to upper case
First, let’s clean the state column -
library(rapportools) # for trim.space
## Warning: package 'rapportools' was built under R version 3.6.3
## Loading required package: reshape
## Warning: package 'reshape' was built under R version 3.6.3
##
## Attaching package: 'reshape'
## The following object is masked from 'package:lubridate':
##
## stamp
## The following objects are masked from 'package:tidyr':
##
## expand, smiths
## The following object is masked from 'package:dplyr':
##
## rename
##
## Attaching package: 'rapportools'
## The following object is masked from 'package:dplyr':
##
## n
## The following objects are masked from 'package:stats':
##
## IQR, median, sd, var
## The following objects are masked from 'package:base':
##
## max, mean, min, range, sum
# View with all values in upper case
View(table(str_to_upper(df_x$state)))
# The data discrepancies are -
# 1. BAJA CALIFORNIA
# 2. NEW YORK
# 3. SECC TERRAZAS
# 4. MP
# Let's change that
df_x$state <- trim.space(str_to_upper(df_x$state))
df_x$state[df_x$state == 'NEW YORK'] <- 'NY'
df_x$state[df_x$state == 'BAJA CALIFORNIA'] <- 'CA'
df_x$state[df_x$state == 'SECC TERRAZAS'] <- 'CA'
df_x$state[df_x$state == 'MP'] <- 'NY'
# There are only 17 samples for MD which can be changed to DC
# There are only 2 samples for NJ which can be changed to NY
df_x$state[df_x$state == 'MD'] <- 'DC'
df_x$state[df_x$state == 'NJ'] <- 'NY'
# There is a separate column for LA which can be changed to CA
df_x$state[df_x$state == 'LA'] <- 'CA'
unique(df_x$state)
## [1] "TN" "CA" "DC" "NY" "WA" "IL" "MA" "TX" "OR" "CO" "NC" NA
# test data
tdf$state <- trim.space(str_to_upper(tdf$state))
View(table(tdf$state))
tdf$state[tdf$state == 'LA'] <- 'CA'
tdf$state[tdf$state == 'MD'] <- 'DC'
unique(tdf$state)
## [1] "NY" "CA" "OR" "TX" "CO" "IL" "TN" "DC" "WA" "MA" "NC" NA
# View(sort(table(tdf$city), decreasing = T))
# View(table(tdf$host_location))
# View(table(tdf$host_neighbourhood))
# View(table(tdf$license))
# View(table(tdf$neighbourhood))
# View(table(tdf$smart_location))
They are pretty much okay.
Let’s capitalize and trim these categorical columns -
# city
df_x$city <- trim.space(str_to_upper(df_x$city))
# test data
tdf$city <- trim.space(str_to_upper(tdf$city))
# city_name
df_x$city_name <- trim.space(str_to_upper(df_x$city_name))
# test data
tdf$city_name <- trim.space(str_to_upper(tdf$city_name))
# host_location
df_x$host_location <- trim.space(str_to_upper(df_x$host_location))
# test data
tdf$host_location <- trim.space(str_to_upper(tdf$host_location))
# host_neighborhood
df_x$host_neighbourhood <- trim.space(str_to_upper(df_x$host_neighbourhood))
# test data
tdf$host_neighbourhood <- trim.space(str_to_upper(tdf$host_neighbourhood))
# neighborhood
df_x$neighbourhood <- trim.space(str_to_upper(df_x$neighbourhood))
# test data
tdf$neighbourhood <- trim.space(str_to_upper(tdf$neighbourhood))
# smart_location
df_x$smart_location <- trim.space(str_to_upper(df_x$smart_location))
# test data
tdf$smart_location <- trim.space(str_to_upper(tdf$smart_location))
Now let’s view the number of categories for each of these variables and see if they have reduced -
length(unique(df_x_main$city)) # 739
## [1] 739
length(unique(df_x$city)) # 526
## [1] 526
length(unique(df_x_main$host_location)) # 2155
## [1] 2155
length(unique(df_x$host_location)) # 2118
## [1] 2118
length(unique(df_x_main$host_neighbourhood)) # 1304
## [1] 1304
length(unique(df_x$host_neighbourhood)) # 1295
## [1] 1295
length(unique(df_x_main$neighborhood)) # 62392
## [1] 62392
length(unique(df_x$neighborhood)) # 62392
## [1] 62392
length(unique(df_x_main$smart_location)) # 751
## [1] 751
length(unique(df_x$smart_location)) # 634
## [1] 634
The test data -
length(unique(tdf_x_main$city)) # 265
## [1] 265
length(unique(tdf$city)) # 221
## [1] 221
length(unique(tdf_x_main$host_location)) # 600
## [1] 600
length(unique(tdf$host_location)) # 593
## [1] 593
length(unique(tdf_x_main$host_neighbourhood)) # 879
## [1] 879
length(unique(tdf$host_neighbourhood)) # 877
## [1] 877
length(unique(tdf_x_main$neighborhood)) # 8194
## [1] 8194
length(unique(tdf$neighborhood)) # 8194
## [1] 8194
length(unique(tdf_x_main$smart_location)) # 268
## [1] 268
length(unique(tdf$smart_location)) # 250
## [1] 250
Now, the date columns; Let’s convert them to date type; And then extract the number of years of experience the listing’s host has had as a host; We shall also find the number of years since becoming a host did it take for them to get their first review -
df_x <- df_x %>%
mutate(first_review = ymd(first_review),
host_since = ymd(host_since),
host_age = 2020 - year(host_since),
start_to_review_gap = year(first_review) - year(host_since))
# test data
tdf <- tdf %>%
mutate(first_review = ymd(first_review),
host_since = ymd(host_since),
host_age = 2020 - year(host_since),
start_to_review_gap = year(first_review) - year(host_since))
View the distribution of the host_age and start_to_review_gap -
qplot(df_x$host_since, df_x$first_review)
## Warning: Removed 151 rows containing missing values (geom_point).
qplot(df_x$host_age)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 151 rows containing non-finite values (stat_bin).
qplot(df_x$start_to_review_gap)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 151 rows containing non-finite values (stat_bin).
Replace the NAs in start_to_review_gap and host_age with the respective means of the column
library(tidyr)
sTor_gap <- mean(df_x$start_to_review_gap)
df_x$start_to_review_gap <- replace_na(df_x$start_to_review_gap, sTor_gap)
# test data
tdf$start_to_review_gap <- replace_na(tdf$start_to_review_gap, sTor_gap)
host_age_mean <- mean(df_x$host_age)
df_x$host_age <- replace_na(df_x$host_age, host_age_mean)
# test data
tdf$host_age <- replace_na(tdf$start_to_review_gap, host_age_mean)
There are some start_to_review_gaps which are lesser than zero (where the first_review date is earlier than the host_since date)
Let’s clean it up
df_x$start_to_review_gap[df_x$start_to_review_gap < 0] <- sTor_gap
# test data
tdf$start_to_review_gap[tdf$start_to_review_gap < 0] <- sTor_gap
Let’s bucket these variables based on the quantiles -
# host_age / experience
df_x <- df_x %>%
mutate(host_age_q = ntile(host_age, 5))
# df_x %>%
# mutate(host_age_q = ntile(host_age, 5)) %>%
# group_by(host_age_q, host_age) %>%
# summarize(n = n()) %>%
# group_by(host_age_q) %>%
# summarize(total = sum(n),
# min_age = min(host_age),
# max_age = max(host_age))
# 1 20000 2 4
# 2 20000 4 5
# 3 20000 5 6
# 4 20000 6 8
# 5 20000 8 12
df_x <- df_x %>%
mutate(
host_lowExp = ifelse(host_age_q == 1, 1, 0),
host_lowMedExp = ifelse(host_age_q == 2, 1, 0),
host_medExp = ifelse(host_age_q == 3, 1, 0),
host_medHighExp = ifelse(host_age_q == 4, 1, 0),
host_highExp = ifelse(host_age_q == 5, 1, 0)
)
tdf <- tdf %>%
mutate(
host_lowExp = ifelse((host_age >= 2) & (host_age < 4), 1, 0),
host_lowMedExp = ifelse((host_age >= 4) & (host_age < 5), 1, 0),
host_medExp = ifelse((host_age >= 5) & (host_age < 6), 1, 0),
host_medHighExp = ifelse((host_age >= 6) & (host_age < 8), 1, 0),
host_highExp = ifelse((host_age >= 8) & (host_age < 12), 1, 0)
)
Now the start_to_review_gap -
Let’s first check the outliers to impute -
sTorR_OutVals <- boxplot(df_x$start_to_review_gap)$out
sTorR_OutVals
## [1] 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 9 8 8 8
## [38] 8 8 9 8 8 8 8 8 8 8 8 8 8 9 9 8 9 8 8 8 8 8 8 8 8 8 8 9 8 8 9 9 8 8 8 8 9
## [75] 8 8 8 8 8 8 9 8 8 8 9 8 8 9 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 9 8 8 8 8 8
## [112] 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 9 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
## [149] 8 8 9 8 8 8 9 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 9 8 8 8 8 8 8 8 9 9
sTorR_TestOutVals <- boxplot(tdf$start_to_review_gap)$out
sTorR_TestOutVals
## [1] 8 8 8 8 8 8 8 8 8 9 8 8 8 8 8 8 8 8 8 8 9 8
Change the outliers -
df_x$start_to_review_gap[df_x$start_to_review_gap > 7] <- 6
tdf$start_to_review_gap[tdf$start_to_review_gap > 7] <- 6
df_x <- df_x %>%
mutate(sTor_gap_q = ntile(start_to_review_gap, 3))
# df_x %>%
# mutate(sTor_gap_q = ntile(start_to_review_gap, 3)) %>%
# group_by(sTor_gap_q, start_to_review_gap) %>%
# summarize(n = n()) %>%
# group_by(sTor_gap_q) %>%
# summarize(total = sum(n),
# min_age = min(start_to_review_gap),
# max_age = max(start_to_review_gap))
# 1 33334 0 1
# 2 33333 1 2
# 3 33333 2 7
df_x <- df_x %>%
mutate(
NoGap_sTor = ifelse(sTor_gap_q == 1, 1, 0),
Yr1Gap_sTor = ifelse(sTor_gap_q == 2, 1, 0),
HighGap_sTor = ifelse(sTor_gap_q == 3, 1, 0)
)
tdf <- tdf %>%
mutate(
NoGap_sTor = ifelse(start_to_review_gap == 0, 1, 0),
Yr1Gap_sTor = ifelse(start_to_review_gap == 1, 1, 0),
HighGap_sTor = ifelse(start_to_review_gap >= 2, 1, 0)
)
Dummy variables for state -
df_x <- df_x %>%
mutate(
In_CA = ifelse(state == 'CA', 1, 0),
In_CO = ifelse(state == 'CO', 1, 0),
In_DC = ifelse(state == 'DC', 1, 0),
In_IL = ifelse(state == 'IL', 1, 0),
In_MA = ifelse(state == 'MA', 1, 0),
In_NC = ifelse(state == 'NC', 1, 0),
In_NY = ifelse(state == 'NY', 1, 0),
In_OR = ifelse(state == 'OR', 1, 0),
In_TN = ifelse(state == 'TN', 1, 0),
In_TX = ifelse(state == 'TX', 1, 0),
In_WA = ifelse(state == 'WA', 1, 0)
)
# test data
tdf <- tdf %>%
mutate(
In_CA = ifelse(state == 'CA', 1, 0),
In_CO = ifelse(state == 'CO', 1, 0),
In_DC = ifelse(state == 'DC', 1, 0),
In_IL = ifelse(state == 'IL', 1, 0),
In_MA = ifelse(state == 'MA', 1, 0),
In_NC = ifelse(state == 'NC', 1, 0),
In_NY = ifelse(state == 'NY', 1, 0),
In_OR = ifelse(state == 'OR', 1, 0),
In_TN = ifelse(state == 'TN', 1, 0),
In_TX = ifelse(state == 'TX', 1, 0),
In_WA = ifelse(state == 'WA', 1, 0)
)
Let’s also include city_name dummies. There might be more variance with these because parameters might change based on cities rather than state on the whole -
unique(df_x$city_name)
## [1] "NASHVILLE" "LOS ANGELES" "SAN DIEGO" "WASHINGTON DC"
## [5] "NEW ORLEANS" "SAN FRANCISCO" "NEW YORK" "SEATTLE"
## [9] "CHICAGO" "BOSTON" "AUSTIN" "PORTLAND"
## [13] "DENVER" "SANTA CRUZ" "OAKLAND" "ASHEVILLE"
## [17] NA
unique(tdf$city_name)
## [1] "NEW YORK" "LOS ANGELES" "NEW ORLEANS" "PORTLAND"
## [5] "AUSTIN" "SAN DIEGO" "DENVER" "CHICAGO"
## [9] "OAKLAND" "NASHVILLE" "WASHINGTON DC" "SEATTLE"
## [13] "SAN FRANCISCO" "BOSTON" "ASHEVILLE" "SANTA CRUZ"
## [17] NA
Let’s create dummies then,
df_x <- df_x %>%
mutate(
In_NAS = ifelse(city_name == 'NASHVILLE', 1, 0),
In_LA = ifelse(city_name == 'LOS ANGELES', 1, 0),
In_SD = ifelse(city_name == 'SAN DIEGO', 1, 0),
In_WDC = ifelse(city_name == 'WASHINGTON DC', 1, 0),
In_NWO = ifelse(city_name == 'NEW ORLEANS', 1, 0),
In_SFO = ifelse(city_name == 'SAN FRANCISCO', 1, 0),
In_NYC = ifelse(city_name == 'NEW YORK', 1, 0),
In_SEA = ifelse(city_name == 'SEATTLE', 1, 0),
In_CHI = ifelse(city_name == 'CHICAGO', 1, 0),
In_BOS = ifelse(city_name == 'BOSTON', 1, 0),
In_AUS = ifelse(city_name == 'AUSTIN', 1, 0),
In_PO = ifelse(city_name == 'PORTLAND', 1, 0),
In_DEN = ifelse(city_name == 'DENVER', 1, 0),
In_SCU = ifelse(city_name == 'SANTA CRUZ', 1, 0),
In_OAK = ifelse(city_name == 'OAKLAND', 1, 0),
In_ASH = ifelse(city_name == 'ASHEVILLE', 1, 0)
)
# test data
tdf <- tdf %>%
mutate(
In_NAS = ifelse(city_name == 'NASHVILLE', 1, 0),
In_LA = ifelse(city_name == 'LOS ANGELES', 1, 0),
In_SD = ifelse(city_name == 'SAN DIEGO', 1, 0),
In_WDC = ifelse(city_name == 'WASHINGTON DC', 1, 0),
In_NWO = ifelse(city_name == 'NEW ORLEANS', 1, 0),
In_SFO = ifelse(city_name == 'SAN FRANCISCO', 1, 0),
In_NYC = ifelse(city_name == 'NEW YORK', 1, 0),
In_SEA = ifelse(city_name == 'SEATTLE', 1, 0),
In_CHI = ifelse(city_name == 'CHICAGO', 1, 0),
In_BOS = ifelse(city_name == 'BOSTON', 1, 0),
In_AUS = ifelse(city_name == 'AUSTIN', 1, 0),
In_PO = ifelse(city_name == 'PORTLAND', 1, 0),
In_DEN = ifelse(city_name == 'DENVER', 1, 0),
In_SCU = ifelse(city_name == 'SANTA CRUZ', 1, 0),
In_OAK = ifelse(city_name == 'OAKLAND', 1, 0),
In_ASH = ifelse(city_name == 'ASHEVILLE', 1, 0)
)
Cancellation Policy -
# View(table(df_x$cancellation_policy))
# 7 strict 47372
# 5 moderate 30411
# 4 flexible 21837
# 8 super_strict_30 261
# 9 super_strict_60 106
# 6 no_refunds 4
strict <- c("no_refunds", "super_strict_60", "super_strict_30", "strict")
df_x <- df_x %>%
mutate(
CancelPol_Strict = ifelse(cancellation_policy %in% strict, 1, 0),
CancelPol_Mod = ifelse(cancellation_policy == 'moderate', 1, 0),
CancelPol_Flexy = ifelse(cancellation_policy == 'flexible', 1, 0)
)
# test data
tdf <- tdf %>%
mutate(
CancelPol_Strict = ifelse(cancellation_policy %in% strict, 1, 0),
CancelPol_Mod = ifelse(cancellation_policy == 'moderate', 1, 0),
CancelPol_Flexy = ifelse(cancellation_policy == 'flexible', 1, 0)
)
Room Type and Property Type -
# View(table(df_x$room_type))
# 1 Entire home/apt 60892
# 2 Private room 36617
# 3 Shared room 2472
# View(table(tdf$room_type))
# 1 Entire home/apt 7425
# 2 Private room 4480
# 3 Shared room 300
df_x <- df_x %>%
mutate(
WholePlace = ifelse(room_type == 'Entire home/apt', 1, 0),
PrivateRoom = ifelse(room_type == 'Private room', 1, 0),
SharedRoom = ifelse(room_type == 'Shared room', 1, 0)
)
# test data
tdf <- tdf %>%
mutate(
WholePlace = ifelse(room_type == 'Entire home/apt', 1, 0),
PrivateRoom = ifelse(room_type == 'Private room', 1, 0),
SharedRoom = ifelse(room_type == 'Shared room', 1, 0)
)
Property type -
df_x$property_type <- trim.space(str_to_upper(df_x$property_type))
tdf$property_type <- trim.space(str_to_upper(tdf$property_type))
# View(table(df_x$property_type))
apartment_types = c("APARTMENT", "CONDOMINIUM", "LOFT", "IN-LAW")
house_types = c("HOUSE", "TOWNHOUSE", "GUESTHOUSE", "TINY HOUSE", "BUNGALOW", "VILLA", "CASA PARTICULAR (CUBA)")
hotel_types = c("BED & BREAKFAST", "BED AND BREAKFAST", "SERVICED APARTMENT", "HOTEL", "APARTHOTEL", "GUEST SUITE")
holiday_types = c("RESORT", "VACATION HOME", "CABIN", "CHALET", "TIMESHARE", "BOAT", "TREEHOUSE", "YURT", "COTTAGE", "HUT", "ISLAND", "BOUTIQUE HOTEL", "CASTLE", "EARTH HOUSE", "CAVE", "TRAIN", "FARM STAY", "BARN", "LIGHTHOUSE", "NATURE LODGE", "PLANE")
other_types = c("OTHER", "DORM", "CAMPER/RV", "HOSTEL", "TENT", "TIPI")
df_x <- df_x %>%
mutate(
propertyApartment = ifelse(property_type %in% apartment_types, 1, 0),
propertyHouse = ifelse(property_type %in% house_types, 1, 0),
propertyHotel = ifelse(property_type %in% hotel_types, 1, 0),
propertyHoliday = ifelse(property_type %in% holiday_types, 1, 0),
propertyOther = ifelse(property_type %in% other_types, 1, 0)
)
# test data
# View(table(tdf$property_type))
tdf <- tdf %>%
mutate(
propertyApartment = ifelse(property_type %in% apartment_types, 1, 0),
propertyHouse = ifelse(property_type %in% house_types, 1, 0),
propertyHotel = ifelse(property_type %in% hotel_types, 1, 0),
propertyHoliday = ifelse(property_type %in% holiday_types, 1, 0),
propertyOther = ifelse(property_type %in% other_types, 1, 0)
)
Bed Type -
# View(table(df_x$bed_type))
# View(table(tdf$bed_type))
# View(table(df_x$bed_type, df_y$high_booking_rate))
# 3 Airbed 0 449
# 4 Couch 0 213
# 5 Futon 0 805
# 6 Pull-out Sofa 0 556
# 7 Real Bed 0 72828
#
# 10 Airbed 1 89
# 11 Couch 1 48
# 12 Futon 1 191
# 13 Pull-out Sofa 1 164
# 14 Real Bed 1 24638
uncomfyBeds = c("Airbed", "Couch", "Futon", "Pull-out Sofa")
df_x <- df_x %>%
mutate(
bed_Real = ifelse(bed_type == 'Real Bed', 1, 0),
bed_Uncomfy = ifelse(bed_type %in% uncomfyBeds, 1, 0)
)
# test data
tdf <- tdf %>%
mutate(
bed_Real = ifelse(bed_type == 'Real Bed', 1, 0),
bed_Uncomfy = ifelse(bed_type %in% uncomfyBeds, 1, 0)
)
host_response_time
# View(table(df_x$host_response_time, df_y$high_booking_rate))
# View(table(df_x$host_response_time))
# View(table(tdf$host_response_time))
# 1 a few days or more 0 1350
# 2 f 0 0
# 3 within a day 0 9800
# 4 within a few hours 0 14333
# 5 within an hour 0 33865
# 6 a few days or more 1 55
# 7 f 1 0
# 8 within a day 1 857
# 9 within a few hours 1 2929
# 10 within an hour 1 21008
# replacing na with the second most popular category
df_x$host_response_time <- replace_na(df_x$host_response_time, 'within a few hours')
df_x <- df_x %>%
mutate(
hRes_FewDays = ifelse(host_response_time == 'a few days or more', 1, 0),
hRes_1Day = ifelse(host_response_time == 'within a day', 1, 0),
hRes_FewHours = ifelse(host_response_time == 'within a few hours', 1, 0),
hRes_1Hour = ifelse(host_response_time == 'within an hour', 1, 0)
)
# test data
tdf <- tdf %>%
mutate(
hRes_FewDays = ifelse(host_response_time == 'a few days or more', 1, 0),
hRes_1Day = ifelse(host_response_time == 'within a day', 1, 0),
hRes_FewHours = ifelse(host_response_time == 'within a few hours', 1, 0),
hRes_1Hour = ifelse(host_response_time == 'within an hour', 1, 0)
)
View(df_x$host_verifications)
View(df_x$amenities)
Some text mining on the columns with categorical lists -
library(qdap)
## Warning: package 'qdap' was built under R version 3.6.3
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
## Warning: package 'qdapRegex' was built under R version 3.6.3
##
## Attaching package: 'qdapRegex'
## The following object is masked from 'package:dplyr':
##
## explain
## The following object is masked from 'package:ggplot2':
##
## %+%
## Loading required package: qdapTools
## Warning: package 'qdapTools' was built under R version 3.6.3
##
## Attaching package: 'qdapTools'
## The following object is masked from 'package:dplyr':
##
## id
## Loading required package: RColorBrewer
##
## Attaching package: 'qdap'
## The following object is masked from 'package:reshape':
##
## condense
## The following object is masked from 'package:forcats':
##
## %>%
## The following object is masked from 'package:stringr':
##
## %>%
## The following object is masked from 'package:purrr':
##
## %>%
## The following object is masked from 'package:tidyr':
##
## %>%
## The following object is masked from 'package:tibble':
##
## %>%
## The following object is masked from 'package:dplyr':
##
## %>%
## The following object is masked from 'package:base':
##
## Filter
df_x$host_verifications <- as.character(df_x$host_verifications)
df_x$host_verifications <- trim.space(str_to_lower(df_x$host_verifications))
freq_verifs <- freq_terms(df_x$host_verifications, 50)
plot(freq_verifs)
Dummies for number of host verifications
df_x <- df_x %>%
mutate(
No_of_Verifs = str_count(df_x$host_verifications, ",") + 1
)
# View(table(df_x$No_of_Verifs))
# 1 1 241
# 2 2 1978
# 3 3 16077
# 4 4 35876
# 5 5 23967
# 6 6 10773
# 7 7 4552
# 8 8 3522
# 9 9 1984
# 10 10 707
# 11 11 217
# 12 12 71
# 13 13 14
# 14 14 2
df_x <- df_x %>%
mutate(
verifs_vLow = ifelse((No_of_Verifs >= 1) & (No_of_Verifs <= 2), 1, 0),
verifs_Med = ifelse((No_of_Verifs >= 3) & (No_of_Verifs <= 5), 1, 0),
verifs_High = ifelse((No_of_Verifs >= 6) & (No_of_Verifs <= 8), 1, 0),
verifs_VHigh = ifelse(No_of_Verifs >= 9, 1, 0)
)
# test data
tdf <- tdf %>%
mutate(
No_of_Verifs = str_count(tdf$host_verifications, ",") + 1
)
tdf <- tdf %>%
mutate(
verifs_vLow = ifelse((No_of_Verifs >= 1) & (No_of_Verifs <= 2), 1, 0),
verifs_Med = ifelse((No_of_Verifs >= 3) & (No_of_Verifs <= 5), 1, 0),
verifs_High = ifelse((No_of_Verifs >= 6) & (No_of_Verifs <= 8), 1, 0),
verifs_VHigh = ifelse(No_of_Verifs >= 9, 1, 0)
)
Create further dummies based on popular verification types -
# 1 phone 2 reviews 3 email 4 jumio 5 kba 6 facebook 7 governmentid 8 workemail 9 offlinegovernmentid 10 google
df_x <- df_x %>%
mutate(
facebookVer = ifelse(grepl("facebook", host_verifications), 1, 0),
googleVer = ifelse(grepl("google", host_verifications), 1, 0),
govtIDVer = ifelse(grepl("government_id", host_verifications), 1, 0),
jumioVer = ifelse(grepl("jumio", host_verifications), 1, 0),
kbaVer = ifelse(grepl("kba", host_verifications), 1, 0)
)
# test data
tdf <- tdf %>%
mutate(
facebookVer = ifelse(grepl("facebook", host_verifications), 1, 0),
googleVer = ifelse(grepl("google", host_verifications), 1, 0),
govtIDVer = ifelse(grepl("government_id", host_verifications), 1, 0),
jumioVer = ifelse(grepl("jumio", host_verifications), 1, 0),
kbaVer = ifelse(grepl("kba", host_verifications), 1, 0)
)
Now, the harder one - amenities -
df_x$amenities <- as.character(df_x$amenities)
df_x$amenities <- trim.space(str_to_lower(df_x$amenities))
df_x$amenities <- str_replace_all(df_x$amenities, " ", "_")
# View(df_x$amenities)
# test data
tdf$amenities <- as.character(tdf$amenities)
tdf$amenities <- trim.space(str_to_lower(tdf$amenities))
tdf$amenities <- str_replace_all(tdf$amenities, " ", "_")
Further, let’s count the number of amenities as well as the most popular amenities -
df_x <- df_x %>%
mutate(No_of_Amens = str_count(amenities, ",") + 1)
# df_x %>%
# mutate(amens_q = ntile(No_of_Amens, 5)) %>%
# group_by(amens_q, No_of_Amens) %>%
# summarize(n = n()) %>%
# group_by(amens_q) %>%
# summarize(total = sum(n),
# min_age = min(No_of_Amens),
# max_age = max(No_of_Amens))
# 1 19999 1 13
# 2 19998 13 17
# 3 19998 17 21
# 4 19998 21 25
# 5 19998 25 78
amens_avg <- mean(df_x$No_of_Amens)
df_x$No_of_Amens <- replace_na(df_x$No_of_Amens, amens_avg)
df_x <- df_x %>%
mutate(
vlowAmens = ifelse((No_of_Amens >= 1) & (No_of_Amens < 13), 1, 0),
lowAmens = ifelse((No_of_Amens >= 13) & (No_of_Amens < 17), 1, 0),
medAmens = ifelse((No_of_Amens >= 17) & (No_of_Amens < 21), 1, 0),
highAmens = ifelse((No_of_Amens >= 21) & (No_of_Amens < 25), 1, 0),
vhighAmens = ifelse(No_of_Amens > 25, 1, 0)
)
# test data
tdf <- tdf %>%
mutate(No_of_Amens = str_count(amenities, ",") + 1)
tdf <- tdf %>%
mutate(
vlowAmens = ifelse((No_of_Amens >= 1) & (No_of_Amens < 13), 1, 0),
lowAmens = ifelse((No_of_Amens >= 13) & (No_of_Amens < 17), 1, 0),
medAmens = ifelse((No_of_Amens >= 17) & (No_of_Amens < 21), 1, 0),
highAmens = ifelse((No_of_Amens >= 21) & (No_of_Amens < 25), 1, 0),
vhighAmens = ifelse(No_of_Amens > 25, 1, 0)
)
k <- gsub(",", " ", df_x$amenities)
freq_amenities <- freq_terms(k, 50)
plot(freq_amenities)
More dummies for specific amenities -
kitchen_amens <- "kitchen|microwave|stove"
kitchen2_amens <- "coffee_maker|refrigerator"
heat_amens <- "heating"
ac_amens <- "air_conditioning"
tv_amens <- "tv|cable_tv"
wifi_amens <- "internet|wireless_internet|wifi"
pets_amens <- "pets_allowed|dogs"
luxury_amens <- "gym|hot_tub|pool|indoor_fireplace|breakfast"
petsLiveHere <- "pets_live_on_this_property"
workspace_amens <- "laptop_friendly_workspace"
parking_amens <- "free_parking_on_premises"
extrabedding <- "bed_linens|extra_pillows_and_blankets"
elevator_amens <- "elevator|elevator_in_building"
privacy_amens <- "private_entrance|lock_on_bedroom_door"
df_x[grep(kitchen_amens, df_x$amenities, value = F), "hasKitchen1"] <- 1
df_x[grep(kitchen2_amens, df_x$amenities, value = F), "hasKitchen2"] <- 1
df_x[grep(heat_amens, df_x$amenities, value = F), "hasHeating"] <- 1
df_x[grep(ac_amens, df_x$amenities, value = F), "hasAC"] <- 1
df_x[grep(tv_amens, df_x$amenities, value = F), "hasTV"] <- 1
df_x[grep(wifi_amens, df_x$amenities, value = F), "hasWiFi"] <- 1
df_x[grep(pets_amens, df_x$amenities, value = F), "PetsAllowed"] <- 1
df_x[grep(luxury_amens, df_x$amenities, value = F), "hasLuxuryAmens"] <- 1
df_x[grep(petsLiveHere, df_x$amenities, value = F), "petsLiveHere"] <- 1
df_x[grep(workspace_amens, df_x$amenities, value = F), "hasWorkspace"] <- 1
df_x[grep(parking_amens, df_x$amenities, value = F), "hasFreeParking"] <- 1
df_x[grep(extrabedding, df_x$amenities, value = F), "extraBedding"] <- 1
df_x[grep(elevator_amens, df_x$amenities, value = F), "hasElevator"] <- 1
df_x[grep(privacy_amens, df_x$amenities, value = F), "hasPrivacyAmens"] <- 1
# test data
tdf[grep(kitchen_amens, tdf$amenities, value = F), "hasKitchen1"] <- 1
tdf[grep(kitchen2_amens, tdf$amenities, value = F), "hasKitchen2"] <- 1
tdf[grep(heat_amens, tdf$amenities, value = F), "hasHeating"] <- 1
tdf[grep(ac_amens, tdf$amenities, value = F), "hasAC"] <- 1
tdf[grep(tv_amens, tdf$amenities, value = F), "hasTV"] <- 1
tdf[grep(wifi_amens, tdf$amenities, value = F), "hasWiFi"] <- 1
tdf[grep(pets_amens, tdf$amenities, value = F), "PetsAllowed"] <- 1
tdf[grep(luxury_amens, tdf$amenities, value = F), "hasLuxuryAmens"] <- 1
tdf[grep(petsLiveHere, tdf$amenities, value = F), "petsLiveHere"] <- 1
tdf[grep(workspace_amens, tdf$amenities, value = F), "hasWorkspace"] <- 1
tdf[grep(parking_amens, tdf$amenities, value = F), "hasFreeParking"] <- 1
tdf[grep(extrabedding, tdf$amenities, value = F), "extraBedding"] <- 1
tdf[grep(elevator_amens, tdf$amenities, value = F), "hasElevator"] <- 1
tdf[grep(privacy_amens, tdf$amenities, value = F), "hasPrivacyAmens"] <- 1
Replace NAs in the newly created columns with 0s
amenCols <- c("hasKitchen1", "hasKitchen2", "hasHeating", "hasAC",
"hasTV", "hasWiFi", "PetsAllowed", "hasLuxuryAmens",
"petsLiveHere", "hasWorkspace", "hasFreeParking",
"extraBedding", "hasElevator", "hasPrivacyAmens")
df_x$hasKitchen1 <- replace_na(df_x$hasKitchen1, 0)
df_x$hasKitchen2 <- replace_na(df_x$hasKitchen2, 0)
df_x$hasHeating <- replace_na(df_x$hasHeating, 0)
df_x$hasAC <- replace_na(df_x$hasAC, 0)
df_x$hasTV <- replace_na(df_x$hasTV, 0)
df_x$hasWiFi <- replace_na(df_x$hasWiFi, 0)
df_x$PetsAllowed <- replace_na(df_x$PetsAllowed, 0)
df_x$hasLuxuryAmens <- replace_na(df_x$hasLuxuryAmens, 0)
df_x$petsLiveHere <- replace_na(df_x$petsLiveHere, 0)
df_x$hasWorkspace <- replace_na(df_x$hasWorkspace, 0)
df_x$hasFreeParking <- replace_na(df_x$hasFreeParking, 0)
df_x$extraBedding <- replace_na(df_x$extraBedding, 0)
df_x$hasElevator <- replace_na(df_x$hasElevator, 0)
df_x$hasPrivacyAmens <- replace_na(df_x$hasPrivacyAmens, 0)
# test data
tdf$hasKitchen1 <- replace_na(tdf$hasKitchen1, 0)
tdf$hasKitchen2 <- replace_na(tdf$hasKitchen2, 0)
tdf$hasHeating <- replace_na(tdf$hasHeating, 0)
tdf$hasAC <- replace_na(tdf$hasAC, 0)
tdf$hasTV <- replace_na(tdf$hasTV, 0)
tdf$hasWiFi <- replace_na(tdf$hasWiFi, 0)
tdf$PetsAllowed <- replace_na(tdf$PetsAllowed, 0)
tdf$hasLuxuryAmens <- replace_na(tdf$hasLuxuryAmens, 0)
tdf$petsLiveHere <- replace_na(tdf$petsLiveHere, 0)
tdf$hasWorkspace <- replace_na(tdf$hasWorkspace, 0)
tdf$hasFreeParking <- replace_na(tdf$hasFreeParking, 0)
tdf$extraBedding <- replace_na(tdf$extraBedding, 0)
tdf$hasElevator <- replace_na(tdf$hasElevator, 0)
tdf$hasPrivacyAmens <- replace_na(tdf$hasPrivacyAmens, 0)
I’d like to see if smoking is allowed or not from the house rules column -
For that, trim spaces and convert to lowercase first -
df_x$house_rules <- as.character(df_x$house_rules)
df_x$house_rules <- trim.space(str_to_lower(df_x$house_rules))
# test data
tdf$house_rules <- as.character(tdf$house_rules)
tdf$house_rules <- trim.space(str_to_lower(tdf$house_rules))
df_x[grep("no smoking", df_x$house_rules, value = F), "NoSmoking"] <- 1
df_x$NoSmoking[is.na(df_x$NoSmoking)] <- 0
# test data
tdf[grep("no smoking", tdf$house_rules, value = F), "NoSmoking"] <- 1
tdf$NoSmoking[is.na(tdf$NoSmoking)] <- 0
Is host from USA?
df_x$host_location <- trim.space(str_to_lower(df_x$host_location))
# test data
tdf$host_location <- trim.space(str_to_lower(tdf$host_location))
df_x <- df_x %>%
mutate(
isHost_in_US = ifelse(grepl("united states", host_location), 1, 0)
)
# test data
tdf <- tdf %>%
mutate(
isHost_in_US = ifelse(grepl("united states", host_location), 1, 0)
)
More processing on the numerical columns now -
accommodates availability_30 availability_365 availability_60 availability_90 bathrooms bedrooms beds cleaning_fee extra_people guests_included
host_listings_count host_response_rate host_total_listings_count
maximum_nights minimum_nights
price
# View(table(df_x$guests_included)) There are some 0s
# View(table(df_x$accommodates)) There are no 0s
# Replacing the 0s and NAs in guests_included with 1s, the most popular category
df_x$guests_included[df_x$guests_included == 0] <- 1
df_x$guests_included <- replace_na(df_x$guests_included, 1)
# test data
tdf$guests_included[tdf$guests_included == 0] <- 1
tdf$guests_included <- replace_na(tdf$guests_included, 1)
# The price can also include the cleaning fee
# summary(df_x$cleaning_fee) - median is 50
df_x$cleaning_fee <- replace_na(df_x$cleaning_fee, 50)
# test data
tdf$cleaning_fee <- replace_na(tdf$cleaning_fee, 50)
# The security deposit has 40K NAs
# summary(df_x$security_deposit)
df_x$security_deposit <- replace_na(df_x$security_deposit, 200)
# test data
tdf$security_deposit <- replace_na(tdf$security_deposit, 200)
df_x <- df_x %>%
mutate(
bathroomGuestRatio = bathrooms/accommodates,
bedroomsGuestRatio = bedrooms/accommodates,
bedsGuestsRatio = beds/accommodates,
pricePerGuest = (price + cleaning_fee)/guests_included,
sec_dep_perc = ifelse(price >=1 , (100*security_deposit)/price, 60)
)
# test data
tdf <- tdf %>%
mutate(
bathroomGuestRatio = bathrooms/accommodates,
bedroomsGuestRatio = bedrooms/accommodates,
bedsGuestsRatio = beds/accommodates,
pricePerGuest = (price + cleaning_fee)/guests_included,
sec_dep_perc = ifelse(price >=1 , (100*security_deposit)/price, 60)
)
There are some sec_dep_perc that are greater than 100!
Let’s change those -
df_x$sec_dep_perc[df_x$sec_dep_perc > 100] <- 60
# test data
tdf$sec_dep_perc[tdf$sec_dep_perc > 100] <- 60
Now, we can bin the security deposits -
df_x <- df_x %>%
mutate(
secDep0 = ifelse(sec_dep_perc == 0, 1, 0),
secDepGen = ifelse((sec_dep_perc > 0) & (sec_dep_perc <= 60), 1, 0),
secDepHigh = ifelse(sec_dep_perc > 60, 1, 0)
)
# test data
tdf <- tdf %>%
mutate(
secDep0 = ifelse(sec_dep_perc == 0, 1, 0),
secDepGen = ifelse((sec_dep_perc > 0) & (sec_dep_perc <= 60), 1, 0),
secDepHigh = ifelse(sec_dep_perc > 60, 1, 0)
)
The pricePerGuest can be further divided into dummies based on quantiles -
# df_x %>%
# mutate(pPG_q = ntile(pricePerGuest, 5)) %>%
# group_by(pPG_q, pricePerGuest) %>%
# summarize(n = n()) %>%
# group_by(pPG_q) %>%
# summarize(total = sum(n),
# min_age = min(pricePerGuest),
# max_age = max(pricePerGuest))
#
# summary(df_x$pricePerGuest)
df_x <- df_x %>%
mutate(
vlow_pPG = ifelse((pricePerGuest >= 0) & (pricePerGuest < 44), 1, 0),
low_pPG = ifelse((pricePerGuest >= 44) & (pricePerGuest < 87.5), 1, 0),
med_pPG = ifelse((pricePerGuest >= 87.5) & (pricePerGuest < 250), 1, 0),
high_pPG = ifelse((pricePerGuest >= 250) & (pricePerGuest < 1000), 1, 0),
vHigh_pPG = ifelse(pricePerGuest >= 1000, 1, 0)
)
# test data
tdf <- tdf%>%
mutate(
vlow_pPG = ifelse((pricePerGuest >= 0) & (pricePerGuest < 44), 1, 0),
low_pPG = ifelse((pricePerGuest >= 44) & (pricePerGuest < 87.5), 1, 0),
med_pPG = ifelse((pricePerGuest >= 87.5) & (pricePerGuest < 250), 1, 0),
high_pPG = ifelse((pricePerGuest >= 250) & (pricePerGuest < 1000), 1, 0),
vHigh_pPG = ifelse(pricePerGuest >= 1000, 1, 0)
)
Making dummies of bathroomGuestRatio, bedsGuestRatio, and bedroomsGuestRation -
First, bedsGuestsRatio -
# df_x %>%
# mutate(bGR_q = ntile(bedsGuestsRatio, 5)) %>%
# group_by(bGR_q, bedsGuestsRatio) %>%
# summarize(n = n()) %>%
# group_by(bGR_q) %>%
# summarize(total = sum(n),
# min_age = min(bedsGuestsRatio),
# max_age = max(bedsGuestsRatio))
#
# summary(df_x$bedsGuestsRatio)
df_x$bedsGuestsRatio <- replace_na(df_x$bedsGuestsRatio, 1)
# test data
tdf$bedsGuestsRatio <- replace_na(tdf$bedsGuestsRatio, 1)
df_x <- df_x %>%
mutate(
beds_Comfy = ifelse((bedsGuestsRatio >= 0.7333333) & (bedsGuestsRatio < 1.3), 1, 0),
beds_Cramped = ifelse(bedsGuestsRatio < 0.7333333, 1, 0),
beds_Excess = ifelse(bedsGuestsRatio > 1.3, 1, 0)
)
# test data
tdf <- tdf %>%
mutate(
beds_Comfy = ifelse((bedsGuestsRatio >= 0.7333333) & (bedsGuestsRatio < 1.3), 1, 0),
beds_Cramped = ifelse(bedsGuestsRatio < 0.7333333, 1, 0),
beds_Excess = ifelse(bedsGuestsRatio > 1.3, 1, 0)
)
Second, bathroomGuestRatio
# df_x %>%
# mutate(baGR_q = ntile(bathroomGuestRatio, 5)) %>%
# group_by(baGR_q, bathroomGuestRatio) %>%
# summarize(n = n()) %>%
# group_by(baGR_q) %>%
# summarize(total = sum(n),
# min_age = min(bathroomGuestRatio),
# max_age = max(bathroomGuestRatio))
#
# summary(df_x$bathroomGuestRatio)
df_x$bathroomGuestRatio <- replace_na(df_x$bedsGuestsRatio, 0.7)
# test data
tdf$bathroomGuestRatio <- replace_na(tdf$bathroomGuestRatio, 0.7)
df_x <- df_x %>%
mutate(
baths_Comfy = ifelse((bedsGuestsRatio >= 0.5) & (bedsGuestsRatio < 1.2), 1, 0),
baths_Cramped = ifelse(bedsGuestsRatio < 0.5, 1, 0),
baths_Excess = ifelse(bedsGuestsRatio > 1.2, 1, 0)
)
# test data
tdf <- tdf %>%
mutate(
baths_Comfy = ifelse((bedsGuestsRatio >= 0.5) & (bedsGuestsRatio < 1.2), 1, 0),
baths_Cramped = ifelse(bedsGuestsRatio < 0.5, 1, 0),
baths_Excess = ifelse(bedsGuestsRatio > 1.2, 1, 0)
)
Finally, bedroomsGuestRation
# summary(df_x$bedroomsGuestRatio)
df_x$bedroomsGuestRatio <- replace_na(df_x$bedroomsGuestRatio, 0.5)
# test data
tdf$bedroomsGuestRatio <- replace_na(tdf$bedroomsGuestRatio, 0.5)
df_x <- df_x %>%
mutate(
bedrooms_Comfy = ifelse((bedroomsGuestRatio >= 0.5) & (bedroomsGuestRatio < 1.2), 1, 0),
bedrooms_Cramped = ifelse(bedroomsGuestRatio < 0.5, 1, 0),
bedrooms_Excess = ifelse(bedroomsGuestRatio > 1.2, 1, 0)
)
# test data
tdf <- tdf %>%
mutate(
bedrooms_Comfy = ifelse((bedroomsGuestRatio >= 0.5) & (bedroomsGuestRatio < 1.2), 1, 0),
bedrooms_Cramped = ifelse(bedroomsGuestRatio < 0.5, 1, 0),
bedrooms_Excess = ifelse(bedroomsGuestRatio > 1.2, 1, 0)
)
There are columns with text in them -
Let’s treat the numerical columns which we are not going to be binning -
Scale availabilities and accommodates -
# summary(df_x$availability_30)
# summary(df_x$availability_60)
# summary(df_x$availability_90)
avail_vars <- df_x %>%
select(availability_30, availability_60, availability_90, availability_365, accommodates)
processed_vars <- preProcess(avail_vars, method = c("YeoJohnson"))
df_x <- predict(processed_vars, df_x)
# test data
tdf <- predict(preProcess(tdf %>% select(availability_30, availability_60, availability_90, availability_365, accommodates), method = c("YeoJohnson")), tdf)
Let’s take a look at cleaning fee and replace NAs with the median
qplot(df_x$cleaning_fee)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
qplot(df_x$security_deposit)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
df_x$cleaning_fee <- replace_na(df_x$cleaning_fee, 50)
summary(df_x$cleaning_fee)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 30.00 50.00 63.88 80.00 1000.00
Create standardized columns for price, cleaning_fee, security_deposit, maximum_nights
processed_vars <- preProcess(df_x %>% select(price, extra_people, maximum_nights), method = "YeoJohnson")
df_x <- predict(processed_vars, df_x)
# test data
tdf <- predict(preProcess(tdf %>% select(price, extra_people, maximum_nights), method = "YeoJohnson"), tdf)
Minimum Nights dummies
# summary(df_x$minimum_nights)
# View(table(df_x$minimum_nights))
df_x$minimum_nights[df_x$minimum_nights > 1250] <- 100 # random
# summary(df_x$minimum_nights)
df_x <- df_x %>%
mutate(
minNigh1 = ifelse(minimum_nights == 1, 1, 0),
minNigh2 = ifelse(minimum_nights == 2, 1, 0),
minNigh3 = ifelse(minimum_nights == 3, 1, 0),
minNight_Excess = ifelse(minimum_nights > 3, 1, 0)
)
# test data
tdf <- tdf %>%
mutate(
minNigh1 = ifelse(minimum_nights == 1, 1, 0),
minNigh2 = ifelse(minimum_nights == 2, 1, 0),
minNigh3 = ifelse(minimum_nights == 3, 1, 0),
minNight_Excess = ifelse(minimum_nights > 3, 1, 0)
)
Dummies for presence of cleaning fee and security deposit -
(Commented because combined cleaning_fee with price and created dummies for security_deposit)
# df_x <- df_x %>%
# mutate(
# isCleaningFee = ifelse(is.na(cleaning_fee) | cleaning_fee == 0, 0, 1),
# isSecurityDeposit = ifelse(is.na(security_deposit) | security_deposit == 0, 0, 1)
# )
# # test data
# tdf <- tdf %>%
# mutate(
# isCleaningFee = ifelse(is.na(cleaning_fee) | cleaning_fee == 0, 0, 1),
# isSecurityDeposit = ifelse(is.na(security_deposit) | security_deposit == 0, 0, 1)
# )
host_response_rate, host_listings_count, host_total_listings_count
# qplot(df_x_main$host_listings_count, df_x_main$host_total_listings_count)
# Both are same
df_x$host_response_rate <- replace_na(df_x$host_response_rate, 100)
# test data
tdf$host_response_rate <- replace_na(tdf$host_response_rate, 100)
# Scale
df_x <- predict(preProcess(df_x %>% select(host_listings_count, host_response_rate), method = c("YeoJohnson")), df_x)
# test data
tdf <- predict(preProcess(tdf %>% select(host_listings_count, host_response_rate), method = c("YeoJohnson")), tdf)